TensorRT-LLMs/0.20.0rc1/python-api/tensorrt_llm.runtime.html
Shi Xiaowei 5e2cf02f46
Update gh-pages (#4284)
update docs for 0.20.0rc2

Signed-off-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
2025-05-14 11:12:52 +08:00

3273 lines
409 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<!-- Single viewport declaration; the page previously repeated it three times. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Runtime &#8212; TensorRT-LLM</title>
<!-- Copies the saved "mode" and "theme" values from localStorage onto the <html> element's data attributes (empty string when unset). -->
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!--
This gives us a CSS class (.pst-js-only) whose elements are hidden only when JavaScript is disabled.
-->
<noscript>
<style>
.pst-js-only { display: none !important; }
</style>
</noscript>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/autodoc_pydantic.css" />
<!-- So that users can add custom icons -->
<script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=65e89d2a"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'python-api/tensorrt_llm.runtime';</script>
<!-- Per-page theme settings added onto the DOCUMENTATION_OPTIONS object. -->
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
<link rel="icon" href="../_static/favicon.png"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Executor" href="../_cpp_gen/executor.html" />
<link rel="prev" title="Quantization" href="tensorrt_llm.quantization.html" />
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc1" />
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<!-- Search dialog: a GET form that sends the query as parameter "q" to ../search.html; the trailing span displays the Ctrl+K shortcut hint. -->
<dialog id="pst-search-dialog">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form>
</dialog>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
<!-- Sidebar toggle labelled "Site navigation"; the explicit type marks it as an in-page action rather than a submit button. -->
<button type="button" class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
<span class="fa-solid fa-bars"></span>
</button>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
<img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
<p class="title logo__title">TensorRT-LLM</p>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<!-- Right-hand header items: the search button and the color-mode button. Both declare type="button" like the other buttons on this page. -->
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<button type="button" class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<div class="navbar-item">
<button type="button" class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<!-- Copy of the search button inside the "navbar-persistent--mobile" container; type="button" is stated explicitly. -->
<div class="navbar-persistent--mobile">
<button type="button" class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<!-- Sidebar toggle labelled "On this page"; the explicit type marks it as an in-page action rather than a submit button. -->
<button type="button" class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
<span class="fa-solid fa-outdent"></span>
</button>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<dialog id="pst-primary-sidebar-modal"></dialog>
<div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
<img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
<p class="title logo__title">TensorRT-LLM</p>
</a>
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<!-- Color-mode button inside the primary sidebar; type="button" is stated explicitly like the other buttons on this page. -->
<div class="sidebar-header-items__end">
<div class="navbar-item">
<button type="button" class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Table of Contents">
<p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
<div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
</ul>
</details></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client.html">Curl Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Runtime</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb" class="d-print-none">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Runtime</span></li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="module-tensorrt_llm">
<span id="runtime"></span><h1>Runtime<a class="headerlink" href="#module-tensorrt_llm" title="Link to this heading">#</a></h1>
<dl class="py class" id="module-tensorrt_llm.runtime">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ChatGLMGenerationSession">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">ChatGLMGenerationSession</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">model_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.ModelConfig" title="tensorrt_llm.runtime.generation.ModelConfig"><span class="pre">ModelConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_buffer</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mapping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Mapping</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_tensors_to_save</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#ChatGLMGenerationSession"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ChatGLMGenerationSession" title="Link to this definition">#</a></dt>
<dd><p>Bases: <a class="reference internal" href="#tensorrt_llm.runtime.GenerationSession" title="tensorrt_llm.runtime.generation.GenerationSession"><code class="xref py py-class docutils literal notranslate"><span class="pre">GenerationSession</span></code></a></p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.EncDecModelRunner">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">EncDecModelRunner</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_name</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_dir</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_task_uids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_encoder</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/enc_dec_model_runner.html#EncDecModelRunner"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.EncDecModelRunner" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.EncDecModelRunner.encoder_run">
<span class="sig-name descname"><span class="pre">encoder_run</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_length</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">position_ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_type_ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_tasks</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">attention_mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">language_adapter_routings</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/enc_dec_model_runner.html#EncDecModelRunner.encoder_run"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.EncDecModelRunner.encoder_run" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.EncDecModelRunner.from_engine">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_engine</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_name</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_dir</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_task_uids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_encoder</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/enc_dec_model_runner.html#EncDecModelRunner.from_engine"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.EncDecModelRunner.from_engine" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.EncDecModelRunner.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">decoder_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_beams</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_token_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eos_token_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bos_token_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_tasks</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">attention_mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">time_encoder</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_language_adapter_routings</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">decoder_language_adapter_routings</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/enc_dec_model_runner.html#EncDecModelRunner.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.EncDecModelRunner.generate" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.EncDecModelRunner.process_input">
<span class="sig-name descname"><span class="pre">process_input</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">remove_input_padding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_token_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_tasks</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">language_adapter_routings</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/enc_dec_model_runner.html#EncDecModelRunner.process_input"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.EncDecModelRunner.process_input" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSequence">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">GenerationSequence</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seq_idx</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_idx</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#GenerationSequence"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSequence" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSequence.get_batch_idx">
<span class="sig-name descname"><span class="pre">get_batch_idx</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">int</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#GenerationSequence.get_batch_idx"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSequence.get_batch_idx" title="Link to this definition">#</a></dt>
<dd><p>Returns the index of the sequence in the batch.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSequence.get_seq_idx">
<span class="sig-name descname"><span class="pre">get_seq_idx</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">int</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#GenerationSequence.get_seq_idx"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSequence.get_seq_idx" title="Link to this definition">#</a></dt>
<dd><p>Returns the sequence index.</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">GenerationSession</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">model_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.ModelConfig" title="tensorrt_llm.runtime.generation.ModelConfig"><span class="pre">ModelConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_buffer</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mapping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Mapping</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_tensors_to_save</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.batch_size">
<span class="sig-name descname"><span class="pre">batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.batch_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.buffer_allocated">
<span class="sig-name descname"><span class="pre">buffer_allocated</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.buffer_allocated" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.context_mem_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context_mem_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.context_mem_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.conv_kernel">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">conv_kernel</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.conv_kernel" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.cross_attention">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cross_attention</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.cross_attention" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.cuda_graph_mode">
<span class="sig-name descname"><span class="pre">cuda_graph_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.cuda_graph_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.cuda_stream_guard">
<span class="sig-name descname"><span class="pre">cuda_stream_guard</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.cuda_stream_guard"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.cuda_stream_guard" title="Link to this definition">#</a></dt>
<dd><p>Sync the external stream and set the current stream to the one bound to the session. Reset on exit.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.debug_mode">
<span class="sig-name descname"><span class="pre">debug_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.debug_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.debug_tensors_to_save">
<span class="sig-name descname"><span class="pre">debug_tensors_to_save</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.debug_tensors_to_save" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.decode">
<span class="sig-name descname"><span class="pre">decode</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_list</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_list</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_mask</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.decode"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.decode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.decode_batch">
<span class="sig-name descname"><span class="pre">decode_batch</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.decode_batch"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.decode_batch" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.decode_regular">
<span class="sig-name descname"><span class="pre">decode_regular</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">scfg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_context_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_context_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_indirections</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">hidden_states</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ite</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_limit_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_mask</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.decode_regular"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.decode_regular" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.decode_stream">
<span class="sig-name descname"><span class="pre">decode_stream</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">scfg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_context_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_context_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_indirections</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">hidden_states</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ite</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_limit_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_mask</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.decode_stream"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.decode_stream" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.device">
<span class="sig-name descname"><span class="pre">device</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">device</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.device" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.dtype">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dtype</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.dump_debug_buffers">
<span class="sig-name descname"><span class="pre">dump_debug_buffers</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.dump_debug_buffers"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.dump_debug_buffers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.early_stop_criteria">
<span class="sig-name descname"><span class="pre">early_stop_criteria</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">step</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">should_stop</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.early_stop_criteria"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.early_stop_criteria" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.engine_inspector">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">engine_inspector</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.engine_inspector" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.filter_medusa_logits">
<span class="sig-name descname"><span class="pre">filter_medusa_logits</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_path</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_path_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_logits</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.filter_medusa_logits"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.filter_medusa_logits" title="Link to this definition">#</a></dt>
<dd><p><code class="docutils literal notranslate"><span class="pre">medusa_logits</span></code> has shape [nMH, bs, nMT+1, vocab].</p>
<p>The returned value has shape [nMH, bs, vocab].</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.finalize_decoder">
<span class="sig-name descname"><span class="pre">finalize_decoder</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">context_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">scfg</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">in_progress</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.finalize_decoder"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.finalize_decoder" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.find_best_medusa_path">
<span class="sig-name descname"><span class="pre">find_best_medusa_path</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">next_logits</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">temp</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.find_best_medusa_path"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.find_best_medusa_path" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.first_layer">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">first_layer</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.first_layer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.gather_context_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_context_logits</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.gather_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.gather_generation_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.gather_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.gemm_allreduce_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gemm_allreduce_plugin</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.gemm_allreduce_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.get_next_medusa_tokens">
<span class="sig-name descname"><span class="pre">get_next_medusa_tokens</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">next_medusa_logits</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.get_next_medusa_tokens"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.get_next_medusa_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.get_num_heads_kv">
<span class="sig-name descname"><span class="pre">get_num_heads_kv</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">layer_idx</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">int</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.get_num_heads_kv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.get_num_heads_kv" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.handle_per_step">
<span class="sig-name descname"><span class="pre">handle_per_step</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_indirections</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">step</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_context_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">hidden_states</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">scfg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_block_offsets</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_kv_cache_block_offsets</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_kv_cache_block_offsets</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_cross_kv_cache_block_offsets</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_embedding_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_context_lengths</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">attention_mask</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_mask_for_context</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_mask_for_gen</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_vocab_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ite</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_limit_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">next_step_tensors</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">RuntimeTensor</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_data</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.handle_per_step"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.handle_per_step" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.has_position_embedding">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">has_position_embedding</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.has_position_embedding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.has_token_type_embedding">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">has_token_type_embedding</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.has_token_type_embedding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.head_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">head_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.head_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.hidden_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">hidden_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.is_medusa_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">is_medusa_mode</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.is_medusa_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.is_redrafter_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">is_redrafter_mode</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.is_redrafter_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.kv_cache_type">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">kv_cache_type</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.kv_cache_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.last_layer">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">last_layer</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.last_layer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.locate_accepted_draft_tokens">
<span class="sig-name descname"><span class="pre">locate_accepted_draft_tokens</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_path</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_path_len</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">draft_paths</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.locate_accepted_draft_tokens"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.locate_accepted_draft_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.mapping">
<span class="sig-name descname"><span class="pre">mapping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Mapping</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.mapping" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.max_draft_tokens">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_draft_tokens</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.max_draft_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.max_prompt_embedding_table_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.max_prompt_embedding_table_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_decode_and_verify">
<span class="sig-name descname"><span class="pre">medusa_decode_and_verify</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logits</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.medusa_decode_and_verify"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_decode_and_verify" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_paths">
<span class="sig-name descname"><span class="pre">medusa_paths</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_paths" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_position_offsets">
<span class="sig-name descname"><span class="pre">medusa_position_offsets</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_position_offsets" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_temperature">
<span class="sig-name descname"><span class="pre">medusa_temperature</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_temperature" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_topks">
<span class="sig-name descname"><span class="pre">medusa_topks</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_topks" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.medusa_tree_ids">
<span class="sig-name descname"><span class="pre">medusa_tree_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.medusa_tree_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids">
<span class="sig-name descname"><span class="pre">next_medusa_input_ids</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.next_medusa_input_ids"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.num_draft_tokens">
<span class="sig-name descname"><span class="pre">num_draft_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.num_draft_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.num_heads">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_heads</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.num_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.num_layers">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_layers</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.num_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.num_medusa_heads">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_medusa_heads</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.num_medusa_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.paged_kv_cache">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">paged_kv_cache</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.paged_kv_cache" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.paged_state">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">paged_state</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.paged_state" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.pp_communicate_final_output_ids">
<span class="sig-name descname"><span class="pre">pp_communicate_final_output_ids</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">final_output_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.pp_communicate_final_output_ids"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.pp_communicate_final_output_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.pp_communicate_new_tokens">
<span class="sig-name descname"><span class="pre">pp_communicate_new_tokens</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">should_stop</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_indir</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence_length</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.pp_communicate_new_tokens"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.pp_communicate_new_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.process_logits_including_draft">
<span class="sig-name descname"><span class="pre">process_logits_including_draft</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">step</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">next_step_buffer</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.process_logits_including_draft"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.process_logits_including_draft" title="Link to this definition">#</a></dt>
<dd><ol class="arabic simple">
<li><p>Process logits to tokens and validate (Medusa) or process outputs (ReDrafter)</p></li>
<li><p>Extract early stop criteria here: self.accept_lengths</p></li>
<li><p>Update output ids: needs self.new_tokens and past_sequence_length</p></li>
<li><p>Get next input_ids: self.[new_tokens, accept_lengths, medusa_output_tokens]</p></li>
<li><p>Update KV cache: self.[sequence_length, num_draft_tokens]</p></li>
<li><p>Update sequence_length_buffer and past_kv_length</p></li>
</ol>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.profiler">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">profiler</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.profiler" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.quant_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">quant_mode</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.quant_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.remove_input_padding">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">remove_input_padding</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.remove_input_padding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.reorder_kv_cache_for_beam_search">
<span class="sig-name descname"><span class="pre">reorder_kv_cache_for_beam_search</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_context_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">step</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.reorder_kv_cache_for_beam_search"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.reorder_kv_cache_for_beam_search" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.rnn_conv_dim_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">rnn_conv_dim_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.rnn_conv_dim_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.rnn_head_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">rnn_head_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.rnn_head_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.rnn_hidden_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">rnn_hidden_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.rnn_hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.runtime">
<span class="sig-name descname"><span class="pre">runtime</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">_Runtime</span></em><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.runtime" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.setup">
<span class="sig-name descname"><span class="pre">setup</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_context_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_attention_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_max_input_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_manager</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoraManager</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_uids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">multi_block_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.setup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.setup" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.state_dtype">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">state_dtype</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.state_dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.state_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">state_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.state_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.tokens_per_block">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokens_per_block</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.tokens_per_block" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.update_output_ids_by_offset">
<span class="sig-name descname"><span class="pre">update_output_ids_by_offset</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">new_generated_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">offsets</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#GenerationSession.update_output_ids_by_offset"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.update_output_ids_by_offset" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.use_gemm_allreduce_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_gemm_allreduce_plugin</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.use_gemm_allreduce_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_gpt_attention_plugin</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.use_kv_cache">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_kv_cache</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.use_kv_cache" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.use_lora_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_lora_plugin</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.use_lora_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.use_mamba_conv1d_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_mamba_conv1d_plugin</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.use_mamba_conv1d_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.GenerationSession.vocab_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocab_size</span></span><a class="headerlink" href="#tensorrt_llm.runtime.GenerationSession.vocab_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.KVCacheManager">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">KVCacheManager</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_layers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_blocks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">block_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokens_per_block</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_blocks_per_seq</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_attention_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sink_token_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_one_more_block</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#KVCacheManager"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.KVCacheManager" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.KVCacheManager.add_sequence">
<span class="sig-name descname"><span class="pre">add_sequence</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">sequence</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.GenerationSequence" title="tensorrt_llm.runtime.kv_cache_manager.GenerationSequence"><span class="pre">GenerationSequence</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">always_share_across_beam</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#KVCacheManager.add_sequence"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.KVCacheManager.add_sequence" title="Link to this definition">#</a></dt>
<dd><p>Add a sequence to the manager and allocate the minimum number of blocks for the context.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.KVCacheManager.get_block_offsets">
<span class="sig-name descname"><span class="pre">get_block_offsets</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">Tensor</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#KVCacheManager.get_block_offsets"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.KVCacheManager.get_block_offsets" title="Link to this definition">#</a></dt>
<dd><p>Returns an array of offsets into the memory pools.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.KVCacheManager.step">
<span class="sig-name descname"><span class="pre">step</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">finished</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">bool</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/kv_cache_manager.html#KVCacheManager.step"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.KVCacheManager.step" title="Link to this definition">#</a></dt>
<dd><p>Iterate to the next generation step.
Add new blocks where needed and clear finished sequences.</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.LogitsProcessor">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">LogitsProcessor</span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#LogitsProcessor"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.LogitsProcessor" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Base class for all logit processors that can be applied during generation.</p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.LogitsProcessorList">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">LogitsProcessorList</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">iterable</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">()</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">/</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#LogitsProcessorList"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.LogitsProcessorList" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">list</span></code>, <a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><code class="xref py py-class docutils literal notranslate"><span class="pre">LogitsProcessor</span></code></a></p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">ModelConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">vocab_size:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_layers:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_heads:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_kv_heads:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">hidden_size:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gpt_attention_plugin:</span> <span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gemm_allreduce_plugin:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">remove_input_padding:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">model_name:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">''</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">&lt;KVCacheType.CONTINUOUS:</span> <span class="pre">0&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">head_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">has_position_embedding:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">has_token_type_embedding:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokens_per_block:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">32</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">quant_mode:</span> <span class="pre">tensorrt_llm.quantization.mode.QuantMode</span> <span class="pre">=</span> <span class="pre">&lt;QuantMode:</span> <span class="pre">0&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dtype:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">''</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_plugin:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_target_modules:</span> <span class="pre">List[str]</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">trtllm_modules_to_hf_modules:</span> <span class="pre">dict</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_cross_kv:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_medusa_heads:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_medusa_tokens:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">paged_state:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mamba_conv1d_plugin:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">conv_kernel:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">layer_types:</span> <span class="pre">List[str]</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rnn_hidden_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rnn_head_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rnn_conv_dim_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">state_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">state_dtype:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">''</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gpu_weights_percent:</span> <span class="pre">float</span> <span class="pre">=</span> <span class="pre">1.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">redrafter_num_beams:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">redrafter_draft_len_per_beam:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_kv_heads_per_layer:</span> <span class="pre">Optional[List[int]]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_kv_heads_per_cross_attn_layer:</span> <span class="pre">Optional[List[int]]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_cross_attn_blocks:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">language_adapter_config:</span> <span class="pre">Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#ModelConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.conv_kernel">
<span class="sig-name descname"><span class="pre">conv_kernel</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.conv_kernel" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.cross_attention">
<span class="sig-name descname"><span class="pre">cross_attention</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.cross_attention" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.dtype">
<span class="sig-name descname"><span class="pre">dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">''</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.gather_context_logits">
<span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.gather_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.gather_generation_logits">
<span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.gather_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.gemm_allreduce_plugin">
<span class="sig-name descname"><span class="pre">gemm_allreduce_plugin</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.gemm_allreduce_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.gpt_attention_plugin">
<span class="sig-name descname"><span class="pre">gpt_attention_plugin</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.gpt_attention_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.gpu_weights_percent">
<span class="sig-name descname"><span class="pre">gpu_weights_percent</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.gpu_weights_percent" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.has_position_embedding">
<span class="sig-name descname"><span class="pre">has_position_embedding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.has_position_embedding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.has_token_type_embedding">
<span class="sig-name descname"><span class="pre">has_token_type_embedding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.has_token_type_embedding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.head_size">
<span class="sig-name descname"><span class="pre">head_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.head_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.hidden_size">
<span class="sig-name descname"><span class="pre">hidden_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.kv_cache_type">
<span class="sig-name descname"><span class="pre">kv_cache_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">KVCacheType</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;KVCacheType.CONTINUOUS:</span> <span class="pre">0&gt;</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.kv_cache_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.language_adapter_config">
<span class="sig-name descname"><span class="pre">language_adapter_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LanguageAdapterConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.language_adapter_config" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.layer_types">
<span class="sig-name descname"><span class="pre">layer_types</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.layer_types" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.lora_plugin">
<span class="sig-name descname"><span class="pre">lora_plugin</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.lora_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.lora_target_modules">
<span class="sig-name descname"><span class="pre">lora_target_modules</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.lora_target_modules" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.mamba_conv1d_plugin">
<span class="sig-name descname"><span class="pre">mamba_conv1d_plugin</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.mamba_conv1d_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.max_batch_size">
<span class="sig-name descname"><span class="pre">max_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.max_batch_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.max_beam_width">
<span class="sig-name descname"><span class="pre">max_beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.max_beam_width" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.max_medusa_tokens">
<span class="sig-name descname"><span class="pre">max_medusa_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.max_medusa_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.max_prompt_embedding_table_size">
<span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.max_prompt_embedding_table_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.model_name">
<span class="sig-name descname"><span class="pre">model_name</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">''</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.model_name" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_heads">
<span class="sig-name descname"><span class="pre">num_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_kv_heads">
<span class="sig-name descname"><span class="pre">num_kv_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_cross_attn_layer">
<span class="sig-name descname"><span class="pre">num_kv_heads_per_cross_attn_layer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_cross_attn_layer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_layer">
<span class="sig-name descname"><span class="pre">num_kv_heads_per_layer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_layer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_layers">
<span class="sig-name descname"><span class="pre">num_layers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.num_medusa_heads">
<span class="sig-name descname"><span class="pre">num_medusa_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.num_medusa_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.paged_state">
<span class="sig-name descname"><span class="pre">paged_state</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.paged_state" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.quant_mode">
<span class="sig-name descname"><span class="pre">quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantMode" title="tensorrt_llm.quantization.mode.QuantMode"><span class="pre">QuantMode</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.quant_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.redrafter_draft_len_per_beam">
<span class="sig-name descname"><span class="pre">redrafter_draft_len_per_beam</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.redrafter_draft_len_per_beam" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.redrafter_num_beams">
<span class="sig-name descname"><span class="pre">redrafter_num_beams</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.redrafter_num_beams" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.remove_input_padding">
<span class="sig-name descname"><span class="pre">remove_input_padding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.remove_input_padding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.rnn_conv_dim_size">
<span class="sig-name descname"><span class="pre">rnn_conv_dim_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.rnn_conv_dim_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.rnn_head_size">
<span class="sig-name descname"><span class="pre">rnn_head_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.rnn_head_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.rnn_hidden_size">
<span class="sig-name descname"><span class="pre">rnn_hidden_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.rnn_hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.skip_cross_attn_blocks">
<span class="sig-name descname"><span class="pre">skip_cross_attn_blocks</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.skip_cross_attn_blocks" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.skip_cross_kv">
<span class="sig-name descname"><span class="pre">skip_cross_kv</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.skip_cross_kv" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.state_dtype">
<span class="sig-name descname"><span class="pre">state_dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">''</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.state_dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.state_size">
<span class="sig-name descname"><span class="pre">state_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.state_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.tokens_per_block">
<span class="sig-name descname"><span class="pre">tokens_per_block</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">32</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.tokens_per_block" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules">
<span class="sig-name descname"><span class="pre">trtllm_modules_to_hf_modules</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">dict</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelConfig.vocab_size">
<span class="sig-name descname"><span class="pre">vocab_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelConfig.vocab_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">ModelRunner</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">session</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.GenerationSession" title="tensorrt_llm.runtime.generation.GenerationSession"><span class="pre">GenerationSession</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">KVCacheType</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_manager</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoraManager</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner.html#ModelRunner"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">ModelRunnerMixin</span></code></p>
<p>An interface class that wraps GenerationSession and provides generation methods.</p>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.dtype">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">dtype</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.from_dir">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dir</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_output_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_ckpt_source</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'hf'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gpu_weights_percent</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">multi_block_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.runtime.ModelRunner" title="tensorrt_llm.runtime.model_runner.ModelRunner"><span class="pre">ModelRunner</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner.html#ModelRunner.from_dir"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.from_dir" title="Link to this definition">#</a></dt>
<dd><p>Create a ModelRunner instance from an engine directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>engine_dir</strong> (<em>str</em>) &#8211; The directory that contains the serialized engine files and config files.</p></li>
<li><p><strong>max_output_len</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) &#8211; The maximum output length. This argument might only be available at loading time; generate will still check it when disable_kv_cache is enabled.</p></li>
<li><p><strong>lora_dir</strong> (<em>Optional</em><em>[</em><em>List</em><em>[</em><em>str</em><em>]</em><em>]</em>) &#8211; The directories that contain LoRA weights.</p></li>
<li><p><strong>rank</strong> (<em>int</em>) &#8211; The runtime rank id.</p></li>
<li><p><strong>debug_mode</strong> (<em>bool</em>) &#8211; Whether or not to turn on the debug mode.</p></li>
<li><p><strong>medusa_choices</strong> (<em>List</em><em>[</em><em>List</em><em>[</em><em>int</em><em>]</em><em>]</em>) &#8211; Medusa choices to use when in Medusa decoding.</p></li>
<li><p><strong>stream</strong> (<em>torch.cuda.Stream</em>) &#8211; Stream to use.</p></li>
<li><p><strong>multi_block_mode</strong> (<em>bool</em>) &#8211; Whether to distribute the work across multiple CUDA thread-blocks on the GPU for the masked MHA kernel.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>An instance of ModelRunner.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.runtime.ModelRunner" title="tensorrt_llm.runtime.ModelRunner">ModelRunner</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.from_engine">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_engine</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Engine</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_output_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_ckpt_source</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gpu_weights_percent</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">multi_block_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.runtime.ModelRunner" title="tensorrt_llm.runtime.model_runner.ModelRunner"><span class="pre">ModelRunner</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner.html#ModelRunner.from_engine"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.from_engine" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.gather_context_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.gather_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.gather_generation_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.gather_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">position_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_uids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_max_input_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_features</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_masks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner.html#ModelRunner.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.generate" title="Link to this definition">#</a></dt>
<dd><p>Generates sequences of token ids.
The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed.
You can override any of the sampling_config's attributes by passing the corresponding parameters.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>batch_input_ids</strong> (<em>List</em><em>[</em><em>torch.Tensor</em><em>]</em>) – A list of input id tensors. Each tensor is of shape (sequence_length, ).</p></li>
<li><p><strong>sampling_config</strong> (<a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.SamplingConfig"><em>SamplingConfig</em></a>) – The sampling configuration to be used as base parametrization for the generation call.
The passed <code class="docutils literal notranslate"><span class="pre">**kwargs</span></code> matching the sampling_config's attributes will override them.
If the sampling_config is not provided, a default will be used.</p></li>
<li><p><strong>prompt_table</strong> (<em>str</em><em> or </em><em>torch.Tensor</em>) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.</p></li>
<li><p><strong>prompt_tasks</strong> (<em>str</em>) – The prompt tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).</p></li>
<li><p><strong>lora_uids</strong> (<em>list</em>) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.</p></li>
<li><p><strong>streaming</strong> (<em>bool</em>) – Whether or not to use streaming mode for generation.</p></li>
<li><p><strong>stopping_criteria</strong> (<a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.StoppingCriteria"><em>StoppingCriteria</em></a>) – Custom stopping criteria.</p></li>
<li><p><strong>logits_processor</strong> (<a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.LogitsProcessor"><em>LogitsProcessor</em></a>) – Custom logits processors.</p></li>
<li><p><strong>medusa_choices</strong> (<em>List</em><em>[</em><em>List</em><em>[</em><em>int</em><em>]</em><em>]</em>) – Medusa decoding choices.</p></li>
<li><p><strong>kwargs</strong> (<em>Dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Ad hoc parametrization of sampling_config.
The passed <code class="docutils literal notranslate"><span class="pre">**kwargs</span></code> matching the sampling_config's attributes will override them.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>If return_dict=False, the method returns generated output_ids.
If return_dict=True, the method returns a dict of output_ids,
sequence_lengths (if sampling_config.output_sequence_lengths=True),
context_logits and generation_logits (if self.gather_context_logits=True
and self.gather_generation_logits=True, respectively).</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>torch.Tensor or dict</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.hidden_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">hidden_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.mapping">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mapping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Mapping</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.mapping" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.max_prompt_embedding_table_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.max_prompt_embedding_table_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.max_sequence_length">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_sequence_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.max_sequence_length" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.num_heads">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.num_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.num_layers">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_layers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.num_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.remove_input_padding">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">remove_input_padding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.remove_input_padding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.serialize_engine">
<span class="sig-name descname"><span class="pre">serialize_engine</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">IHostMemory</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner.html#ModelRunner.serialize_engine"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.serialize_engine" title="Link to this definition">#</a></dt>
<dd><p>Serialize the engine.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The serialized engine.</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>IHostMemory</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.use_lora_plugin">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_lora_plugin</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.use_lora_plugin" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.vocab_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocab_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.vocab_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunner.vocab_size_padded">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocab_size_padded</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunner.vocab_size_padded" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">ModelRunnerCpp</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">executor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Executor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">model_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ModelConfig</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">world_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">WorldConfig</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_kv_cache</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_manager</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoraManager</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner_cpp.html#ModelRunnerCpp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">ModelRunnerMixin</span></code></p>
<p>An interface class that wraps Executor and provides generation methods.</p>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.dtype">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">dtype</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.from_dir">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dir</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_output_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_attention_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_free_gpu_memory_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_kv_cache_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eagle_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eagle_posterior_threshold</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eagle_use_dynamic_tree</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eagle_dynamic_tree_max_top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_ckpt_source</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'hf'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_gpu_direct_storage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gpu_weights_percent</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_tokens_in_paged_kv_cache</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_enable_block_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_chunked_context</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">is_enc_dec</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">multi_block_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor_map</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.generation.LogitsProcessor"><span class="pre">LogitsProcessor</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">device_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">is_orchestrator_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_runtime_defaults</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_variable_beam_width_search</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mm_embedding_offloading</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.runtime.ModelRunnerCpp" title="tensorrt_llm.runtime.model_runner_cpp.ModelRunnerCpp"><span class="pre">ModelRunnerCpp</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner_cpp.html#ModelRunnerCpp.from_dir"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.from_dir" title="Link to this definition">#</a></dt>
<dd><p>Create a ModelRunnerCpp instance from an engine directory.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>engine_dir</strong> (<em>str</em>) The directory that contains the serialized engine files and config files.</p></li>
<li><p><strong>lora_dir</strong> (<em>str</em>) The directory that contains LoRA weights.</p></li>
<li><p><strong>rank</strong> (<em>int</em>) The runtime rank id.</p></li>
<li><p><strong>max_batch_size</strong> (<em>int</em>) The runtime batch size limit. If max_batch_size is not None, it should not
be larger than the engine's max_batch_size; otherwise, the engine's max_batch_size
will be used.</p></li>
<li><p><strong>max_input_len</strong> (<em>int</em>) The runtime input length limit. If max_input_len is not None, it should not
be larger than the engine's max_input_len; otherwise, the engine's max_input_len
will be used.</p></li>
<li><p><strong>max_output_len</strong> (<em>int</em>) The runtime output length limit. If max_output_len is not None, it should not
be larger than the engine's max_output_len; otherwise, the engine's max_output_len
will be used.</p></li>
<li><p><strong>max_beam_width</strong> (<em>int</em>) The runtime beam width limit. If max_beam_width is not None, it should not
be larger than the engine's max_beam_width; otherwise, the engine's max_beam_width
will be used.</p></li>
<li><p><strong>max_attention_window_size</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) The attention window size that controls the sliding window attention / cyclic kv cache behavior.</p></li>
<li><p><strong>sink_token_length</strong> (<em>int</em>) The sink token length, default=0.</p></li>
<li><p><strong>kv_cache_free_gpu_memory_fraction</strong> (<em>float</em>) Fraction of free GPU memory that the KV cache may use.</p></li>
<li><p><strong>cross_kv_cache_fraction</strong> (<em>float</em>) KV Cache fraction reserved for cross attention, should only be used with enc-dec models.</p></li>
<li><p><strong>debug_mode</strong> (<em>bool</em>) Whether or not to turn on the debug mode.</p></li>
<li><p><strong>medusa_choices</strong> (<em>List</em><em>[</em><em>List</em><em>[</em><em>int</em><em>]</em><em>]</em>) Medusa choices to use when in Medusa decoding.</p></li>
<li><p><strong>eagle_choices</strong> (<em>List</em><em>[</em><em>List</em><em>[</em><em>int</em><em>]</em><em>]</em>) Eagle choices to use when in Eagle-1 decoding.</p></li>
<li><p><strong>eagle_posterior_threshold</strong> (<em>float</em>) Minimum token probability threshold for typical acceptance.
Value different from None enables typical acceptance in Eagle.</p></li>
<li><p><strong>eagle_use_dynamic_tree</strong> (<em>bool</em>) Whether to use Eagle-2, which is dynamic tree.</p></li>
<li><p><strong>eagle_dynamic_tree_max_top_k</strong> (<em>int</em>) The maximum number of draft tokens to expand for each node in Eagle-2.</p></li>
<li><p><strong>lora_ckpt_source</strong> (<em>str</em>) Source of checkpoint. Should be one of ['hf', 'nemo'].</p></li>
<li><p><strong>max_tokens_in_paged_kv_cache</strong> (<em>int</em>) Maximum amount of tokens configured in kv cache.</p></li>
<li><p><strong>kv_cache_enable_block_reuse</strong> (<em>bool</em>) Enables block reuse in kv cache.</p></li>
<li><p><strong>enable_chunked_context</strong> (<em>bool</em>) Enables chunked context.</p></li>
<li><p><strong>is_enc_dec</strong> (<em>bool</em>) Whether the model is encoder-decoder architecture.</p></li>
<li><p><strong>multi_block_mode</strong> (<em>bool</em>) Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.</p></li>
<li><p><strong>enable_context_fmha_fp32_acc</strong> (<em>bool</em>) Enable FMHA runner FP32 accumulation.</p></li>
<li><p><strong>cuda_graph_mode</strong> (<em>bool</em>) Whether to use cuda graph for inference.</p></li>
<li><p><strong>logits_processor_map</strong> (<em>Dict</em><em>[</em><em>str</em><em>, </em><a class="reference internal" href="#tensorrt_llm.runtime.LogitsProcessor" title="tensorrt_llm.runtime.LogitsProcessor"><em>LogitsProcessor</em></a><em>]</em>) A map of logits processor functions indexed by names. A name can be provided later to
the generate() function to specify which logits processor to run.</p></li>
<li><p><strong>device_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) Device indices to run the Executor on.</p></li>
<li><p><strong>is_orchestrator_mode</strong> (<em>bool</em>) The mode to run the model-runner, Leader mode by default.</p></li>
<li><p><strong>gather_generation_logits</strong> (<em>bool</em>) Enable gathering generation logits.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>An instance of ModelRunnerCpp.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.runtime.ModelRunnerCpp" title="tensorrt_llm.runtime.ModelRunnerCpp">ModelRunnerCpp</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.gather_context_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.gather_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.gather_generation_logits">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.gather_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">position_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_input_features</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">encoder_output_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_attention_masks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mrope_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="tensorrt_llm.layers.html#tensorrt_llm.layers.attention.MropeParams" title="tensorrt_llm.layers.attention.MropeParams"><span class="pre">MropeParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_uids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stopping_criteria</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><span class="pre">StoppingCriteria</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor_names</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_cum_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_table</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_tasks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_token_extra_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_all_generated_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">language_adapter_uids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mm_embedding_offloading</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/model_runner_cpp.html#ModelRunnerCpp.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.generate" title="Link to this definition">#</a></dt>
<dd><p>Generates sequences of token ids.
The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed.
You can override any sampling_config's attributes by passing corresponding parameters.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>batch_input_ids</strong> (<em>List</em><em>[</em><em>torch.Tensor</em><em>]</em>) A list of input id tensors. Each tensor is of shape (sequence_length, ).</p></li>
<li><p><strong>position_ids</strong> (<em>List</em><em>[</em><em>torch.Tensor</em><em>]</em>) A list of position id tensors. Each tensor is of shape (sequence_length, ).</p></li>
<li><p><strong>encoder_input_ids</strong> (<em>List</em><em>[</em><em>torch.Tensor</em><em>]</em>) A list of encoder input id tensors for encoder-decoder models (optional). Each tensor is of shape (sequence_length, ).</p></li>
<li><p><strong>encoder_input_features</strong> (<em>List</em><em>[</em><em>torch.Tensor</em><em>]</em>) A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor is of shape (sequence_length, feature_dim).</p></li>
<li><p><strong>encoder_output_lengths</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) A list of encoder output lengths (optional) if encoder output has different length from encoder input (due to convolution down-sampling, etc.)</p></li>
<li><p><strong>sampling_config</strong> (<a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.SamplingConfig"><em>SamplingConfig</em></a>) The sampling configuration to be used as base parametrization for the generation call.
The passed <code class="docutils literal notranslate"><span class="pre">**kwargs</span></code> matching the sampling_config's attributes will override them.
If the sampling_config is not provided, a default will be used.</p></li>
<li><p><strong>prompt_table</strong> (<em>str</em><em> or </em><em>torch.Tensor</em>) The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.</p></li>
<li><p><strong>prompt_tasks</strong> (<em>str</em>) The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).</p></li>
<li><p><strong>input_token_extra_ids</strong> (<em>List</em><em>[</em><em>List</em><em>[</em><em>int</em><em>]</em><em>]</em>) Input token extra ids for using p-tuning and KV Cache reuse together</p></li>
<li><p><strong>lora_uids</strong> (<em>list</em>) The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.</p></li>
<li><p><strong>streaming</strong> (<em>bool</em>) Whether or not to use streaming mode for generation.</p></li>
<li><p><strong>stopping_criteria</strong> (<a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.StoppingCriteria"><em>StoppingCriteria</em></a>) Custom stopping criteria.</p></li>
<li><p><strong>logits_processor_names</strong> (<em>List</em><em>[</em><em>str</em><em>]</em>) Custom logits processor names.</p></li>
<li><p><strong>return_all_generated_tokens</strong> (<em>bool</em>) Whether the full output is returned at each streaming step</p></li>
<li><p><strong>kwargs</strong> (<em>Dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) Ad hoc parametrization of sampling_config.
The passed <code class="docutils literal notranslate"><span class="pre">**kwargs</span></code> matching the sampling_config's attributes will override them.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>If return_dict=False, the method returns generated output_ids.
If return_dict=True, the method returns a dict of output_ids,
sequence_lengths (if sampling_config.output_sequence_lengths=True),
context_logits and generation_logits (if self.gather_context_logits=True and
self.gather_generation_logits=True, respectively).</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>torch.Tensor or dict</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.hidden_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">hidden_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.hidden_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.max_prompt_embedding_table_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.max_prompt_embedding_table_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.max_sequence_length">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_sequence_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.max_sequence_length" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.num_heads">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.num_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.num_layers">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_layers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.num_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.remove_input_padding">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">remove_input_padding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.remove_input_padding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.vocab_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocab_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.vocab_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.ModelRunnerCpp.vocab_size_padded">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocab_size_padded</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.ModelRunnerCpp.vocab_size_padded" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">MultimodalModelRunner</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">args</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">audio_engine_dir</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.cpp_e2e">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cpp_e2e</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.cpp_e2e" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.cpp_llm_only">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cpp_llm_only</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.cpp_llm_only" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">post_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">image</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">decoder_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">other_vision_inputs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{}</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">other_audio_inputs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{}</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">other_decoder_inputs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">{}</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.generate" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.get_audio_features">
<span class="sig-name descname"><span class="pre">get_audio_features</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">audio</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other_audio_inputs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.get_audio_features"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_audio_features" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index">
<span class="sig-name descname"><span class="pre">get_rope_index</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LongTensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">image_grid_thw</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LongTensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">video_grid_thw</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LongTensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">attention_mask</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">Tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">Tensor</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.get_rope_index"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index" title="Link to this definition">#</a></dt>
<dd><p>Calculate the 3D rope index based on the temporal, height and width dimensions of images and videos in the LLM.</p>
<dl>
<dt>Explanation:</dt><dd><p>Each embedding sequence contains either both vision and text embeddings or only text embeddings.</p>
<p>For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
Examples:</p>
<blockquote>
<div><p>input_ids: [T T T T T], here T is for text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]</p>
</div></blockquote>
<p>For a sequence containing both vision and text embeddings, we calculate a 3D rotary position embedding for the vision part
and a 1D rotary position embedding for the text part.
Examples:</p>
<blockquote>
<div><p>Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here we calculate the text start position_ids as the max vision position_ids plus 1.</p>
</div></blockquote>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input_ids</strong> (<cite>torch.LongTensor</cite> of shape <cite>(batch_size, sequence_length)</cite>) – Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.</p></li>
<li><p><strong>image_grid_thw</strong> (<cite>torch.LongTensor</cite> of shape <cite>(num_images, 3)</cite>, <em>optional</em>) – The temporal, height and width dimensions of the feature shape of each image in the LLM.</p></li>
<li><p><strong>video_grid_thw</strong> (<cite>torch.LongTensor</cite> of shape <cite>(num_videos, 3)</cite>, <em>optional</em>) – The temporal, height and width dimensions of the feature shape of each video in the LLM.</p></li>
<li><p><strong>attention_mask</strong> (<cite>torch.Tensor</cite> of shape <cite>(batch_size, sequence_length)</cite>, <em>optional</em>) – Mask to avoid performing attention on padding token indices. Mask values selected in <cite>[0, 1]</cite>:</p>
<ul>
<li><p>1 for tokens that are <strong>not masked</strong>,</p></li>
<li><p>0 for tokens that are <strong>masked</strong>.</p></li>
</ul>
</li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>position_ids (<cite>torch.LongTensor</cite> of shape <cite>(3, batch_size, sequence_length)</cite>)</p>
<p>mrope_position_deltas (<cite>torch.Tensor</cite> of shape <cite>(batch_size)</cite>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.get_visual_features">
<span class="sig-name descname"><span class="pre">get_visual_features</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">image</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other_vision_inputs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.get_visual_features"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_visual_features" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.init_audio_encoder">
<span class="sig-name descname"><span class="pre">init_audio_encoder</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.init_audio_encoder"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_audio_encoder" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.init_image_encoder">
<span class="sig-name descname"><span class="pre">init_image_encoder</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.init_image_encoder"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_image_encoder" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.init_llm">
<span class="sig-name descname"><span class="pre">init_llm</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.init_llm"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_llm" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.init_processor">
<span class="sig-name descname"><span class="pre">init_processor</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.init_processor"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_processor" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.init_tokenizer">
<span class="sig-name descname"><span class="pre">init_tokenizer</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.init_tokenizer"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_tokenizer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">llm_engine_dir</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio">
<span class="sig-name descname"><span class="pre">load_test_audio</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">audio_path</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.load_test_audio"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.load_test_data">
<span class="sig-name descname"><span class="pre">load_test_data</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">image_path</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">video_path</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.load_test_data"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.load_test_data" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.prepare_position_ids_for_cogvlm">
<span class="sig-name descname"><span class="pre">prepare_position_ids_for_cogvlm</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.prepare_position_ids_for_cogvlm"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.prepare_position_ids_for_cogvlm" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.preprocess">
<span class="sig-name descname"><span class="pre">preprocess</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">post_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">image</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">other_vision_inputs</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">other_audio_inputs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.preprocess"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.preprocess" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup">
<span class="sig-name descname"><span class="pre">ptuning_setup</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_table</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.ptuning_setup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_fuyu">
<span class="sig-name descname"><span class="pre">ptuning_setup_fuyu</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">image_patches_indices</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.ptuning_setup_fuyu"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_fuyu" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_llava_next">
<span class="sig-name descname"><span class="pre">ptuning_setup_llava_next</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">visual_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">post_prompt</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.ptuning_setup_llava_next"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_llava_next" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_phi3">
<span class="sig-name descname"><span class="pre">ptuning_setup_phi3</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">visual_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">audio_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_img_tokens</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_aud_tokens</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.ptuning_setup_phi3"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_phi3" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.python_e2e">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">python_e2e</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.python_e2e" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.run">
<span class="sig-name descname"><span class="pre">run</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_text</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_image</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_audio</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.run"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.run" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts">
<span class="sig-name descname"><span class="pre">setup_fake_prompts</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">visual_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">post_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.setup_fake_prompts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_qwen2vl">
<span class="sig-name descname"><span class="pre">setup_fake_prompts_qwen2vl</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">visual_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">vision_grid_thws</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">attention_mask</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.setup_fake_prompts_qwen2vl"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_qwen2vl" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_vila">
<span class="sig-name descname"><span class="pre">setup_fake_prompts_vila</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">visual_features</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">split_input_ids</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.setup_fake_prompts_vila"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_vila" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.setup_inputs">
<span class="sig-name descname"><span class="pre">setup_inputs</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_text</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">raw_image</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">raw_audio</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.setup_inputs"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_inputs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.split_prompt_by_images">
<span class="sig-name descname"><span class="pre">split_prompt_by_images</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.split_prompt_by_images"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.split_prompt_by_images" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.tokenizer_image_token">
<em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokenizer_image_token</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">post_prompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">image_token_index</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-200</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.tokenizer_image_token"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.tokenizer_image_token" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Method entry for MultimodalModelRunner.video_preprocess: one parameter, video_path, shown without annotation or default. The signature ends with a [source] link and a "#" self-anchor link. The description <dd> is empty, so no docstring text is displayed. -->
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.video_preprocess">
<span class="sig-name descname"><span class="pre">video_preprocess</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">video_path</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/multimodal_model_runner.html#MultimodalModelRunner.video_preprocess"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.video_preprocess" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Property entry for MultimodalModelRunner.visual_engine_dir: rendered with the "property" keyword, no type annotation, and no [source] link (only the "#" self-anchor). The description <dd> is empty. -->
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">visual_engine_dir</span></span><a class="headerlink" href="#tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<!--
  Class entry for tensorrt_llm.runtime.QWenForCausalLMGenerationSession.
  The signature lists nine constructor parameters, one per <dd>:
  model_config (ModelConfig, linked to its entry on this page), engine_buffer,
  mapping (Mapping), debug_mode (default False), debug_tensors_to_save (default None),
  cuda_graph_mode (default False), stream (Stream, default None),
  global_max_input_length (int, default 2048) and global_max_output_length (int, default 4096).
  The body names the base class GenerationSession and documents one method, generate.
  NOTE(review): each multi-line parameter list is a <dl> whose only children are <dd>
  elements, while the HTML content model for <dl> expects every <dd> to follow a <dt>.
  This markup looks tool-generated (presumably by Sphinx); confirm, and fix it in the
  generator or theme rather than by hand, since the theme styling may rely on this structure.
-->
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.QWenForCausalLMGenerationSession">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">QWenForCausalLMGenerationSession</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">model_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.ModelConfig" title="tensorrt_llm.runtime.generation.ModelConfig"><span class="pre">ModelConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">engine_buffer</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">mapping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Mapping</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">debug_tensors_to_save</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_mode</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Stream</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">global_max_input_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">global_max_output_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">4096</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#QWenForCausalLMGenerationSession"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.QWenForCausalLMGenerationSession" title="Link to this definition">#</a></dt>
<dd><p>Bases: <a class="reference internal" href="#tensorrt_llm.runtime.GenerationSession" title="tensorrt_llm.runtime.generation.GenerationSession"><code class="xref py py-class docutils literal notranslate"><span class="pre">GenerationSession</span></code></a></p>
<!-- Method entry for generate: parameters input_ids (Tensor), input_lengths (Tensor), sampling_config (SamplingConfig, linked to its entry on this page), max_new_tokens (int) and runtime_rank (int, default 0). The description <dd> is empty. -->
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.QWenForCausalLMGenerationSession.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">input_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.runtime.SamplingConfig" title="tensorrt_llm.runtime.generation.SamplingConfig"><span class="pre">SamplingConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">runtime_rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#QWenForCausalLMGenerationSession.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.QWenForCausalLMGenerationSession.generate" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">SamplingConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">20</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_beams</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_return_sequences</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_attention_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_sequence_lengths</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_words_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">numpy.ndarray</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">NoneType</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_words_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">numpy.ndarray</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">NoneType</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch.Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch.Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch.Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_beam_hyps</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">torch.Tensor</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.0</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#SamplingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<!-- Attribute entry for SamplingConfig.bad_words_list: displayed type "list | ndarray | Tensor | None", displayed default None. The class signature earlier on this page renders the same parameter as "list | numpy.ndarray | torch.Tensor | NoneType"; only the rendering differs. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.bad_words_list">
<span class="sig-name descname"><span class="pre">bad_words_list</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">ndarray</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.bad_words_list" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.beam_search_diversity_rate: displayed type "float | Tensor", displayed default 0.0. This name is not among the parameters listed in the SamplingConfig class signature earlier on this page. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.beam_search_diversity_rate">
<span class="sig-name descname"><span class="pre">beam_search_diversity_rate</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.beam_search_diversity_rate" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.early_stopping: displayed type "int | Tensor", displayed default 1. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.early_stopping">
<span class="sig-name descname"><span class="pre">early_stopping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.early_stopping" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.end_id: displayed type "int", no default value shown. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.end_id">
<span class="sig-name descname"><span class="pre">end_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.end_id" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.frequency_penalty: displayed type "float | Tensor", displayed default 0.0. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.frequency_penalty">
<span class="sig-name descname"><span class="pre">frequency_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.frequency_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.length_penalty: displayed type "float | Tensor", displayed default 1.0. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.length_penalty">
<span class="sig-name descname"><span class="pre">length_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.length_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.max_attention_window_size: displayed type "int | None", displayed default None. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.max_attention_window_size">
<span class="sig-name descname"><span class="pre">max_attention_window_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.max_attention_window_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.max_new_tokens: displayed type "int", displayed default 20. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.max_new_tokens">
<span class="sig-name descname"><span class="pre">max_new_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">20</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.max_new_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.min_length: displayed type "int | Tensor", displayed default 1. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.min_length">
<span class="sig-name descname"><span class="pre">min_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.min_length" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.min_p: displayed type "float | Tensor", displayed default 0.0. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.min_p">
<span class="sig-name descname"><span class="pre">min_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.min_p" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.no_repeat_ngram_size: displayed type "int | Tensor", displayed default None. This name is not among the parameters listed in the SamplingConfig class signature earlier on this page. NOTE(review): the displayed annotation does not include None although the displayed default is None; presumably this mirrors the Python source, so any correction belongs there. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.no_repeat_ngram_size">
<span class="sig-name descname"><span class="pre">no_repeat_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.no_repeat_ngram_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.num_beams: displayed type "int", displayed default 1. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.num_beams">
<span class="sig-name descname"><span class="pre">num_beams</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.num_beams" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.num_return_sequences: displayed type "int | None", displayed default None. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.num_return_sequences">
<span class="sig-name descname"><span class="pre">num_return_sequences</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.num_return_sequences" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.output_cum_log_probs: displayed type "bool", displayed default False. This name is not among the parameters listed in the SamplingConfig class signature earlier on this page. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.output_cum_log_probs">
<span class="sig-name descname"><span class="pre">output_cum_log_probs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.output_cum_log_probs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.output_log_probs: displayed type "bool", displayed default False. This name is not among the parameters listed in the SamplingConfig class signature earlier on this page. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.output_log_probs">
<span class="sig-name descname"><span class="pre">output_log_probs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.output_log_probs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.output_sequence_lengths: displayed type "bool", displayed default False. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.output_sequence_lengths">
<span class="sig-name descname"><span class="pre">output_sequence_lengths</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.output_sequence_lengths" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.pad_id: displayed type "int", no default value shown. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.pad_id">
<span class="sig-name descname"><span class="pre">pad_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.pad_id" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.presence_penalty: displayed type "float | Tensor", displayed default 0.0. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.presence_penalty">
<span class="sig-name descname"><span class="pre">presence_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.presence_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<!-- Attribute entry for SamplingConfig.random_seed: displayed type "int | Tensor", displayed default None. NOTE(review): the displayed annotation does not include None although the displayed default is None; presumably this mirrors the Python source, so any correction belongs there. The description <dd> is empty. -->
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.random_seed">
<span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.random_seed" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.repetition_penalty">
<span class="sig-name descname"><span class="pre">repetition_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.repetition_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.return_dict">
<span class="sig-name descname"><span class="pre">return_dict</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.return_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.sink_token_length">
<span class="sig-name descname"><span class="pre">sink_token_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.sink_token_length" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.stop_words_list">
<span class="sig-name descname"><span class="pre">stop_words_list</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">list</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">ndarray</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.stop_words_list" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.temperature">
<span class="sig-name descname"><span class="pre">temperature</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.temperature" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.top_k">
<span class="sig-name descname"><span class="pre">top_k</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.top_k" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.top_p">
<span class="sig-name descname"><span class="pre">top_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Tensor</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.top_p" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.top_p_decay">
<span class="sig-name descname"><span class="pre">top_p_decay</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.top_p_decay" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.top_p_min">
<span class="sig-name descname"><span class="pre">top_p_min</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.top_p_min" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.top_p_reset_ids">
<span class="sig-name descname"><span class="pre">top_p_reset_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.top_p_reset_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.update">
<span class="sig-name descname"><span class="pre">update</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#SamplingConfig.update"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.update" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.SamplingConfig.use_beam_hyps">
<span class="sig-name descname"><span class="pre">use_beam_hyps</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.runtime.SamplingConfig.use_beam_hyps" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">Session</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Session is a managed TensorRT runtime.</p>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.context">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">IExecutionContext</span></em><a class="headerlink" href="#tensorrt_llm.runtime.Session.context" title="Link to this definition">#</a></dt>
<dd><p>&#64;brief: Get the default TensorRT execution context,
use self.engine.create_execution_context() to create a new context if needed
&#64;return: one TensorRT execution context object</p>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.context_mem_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context_mem_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.runtime.Session.context_mem_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.engine">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">engine</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ICudaEngine</span></em><a class="headerlink" href="#tensorrt_llm.runtime.Session.engine" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.from_engine">
<em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_engine</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">engine</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.runtime.Session" title="tensorrt_llm.runtime.session.Session"><span class="pre">Session</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session.from_engine"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session.from_engine" title="Link to this definition">#</a></dt>
<dd><p>&#64;brief: Create a session from an existing ICudaEngine engine
&#64;param engine: an ICudaEngine
&#64;return: a Session object</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.from_serialized_engine">
<em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_serialized_engine</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">engine</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.runtime.Session" title="tensorrt_llm.runtime.session.Session"><span class="pre">Session</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session.from_serialized_engine"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session.from_serialized_engine" title="Link to this definition">#</a></dt>
<dd><p>&#64;brief: Create a session from a serialized engine
&#64;param engine: a serialized engine
&#64;return: a Session object</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.infer_shapes">
<span class="sig-name descname"><span class="pre">infer_shapes</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.runtime.TensorInfo" title="tensorrt_llm.runtime.session.TensorInfo"><span class="pre">TensorInfo</span></a><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">IExecutionContext</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.runtime.TensorInfo" title="tensorrt_llm.runtime.session.TensorInfo"><span class="pre">TensorInfo</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session.infer_shapes"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session.infer_shapes" title="Link to this definition">#</a></dt>
<dd><p>&#64;brief: Set input shapes to given context, and infer the output shapes from the given input shapes.
This function should be called every time the input shapes change, before calling run().
Alternatively, call context.set_input_shape on all dynamic-shaped input tensors manually.</p>
<p>&#64;param inputs: list of TensorInfo objects, each item represents an input tensor
&#64;param context: TensorRT execution context, if None, use the default context
&#64;return: list of TensorInfo objects, each item represents an output tensor, returns None if failed</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.run">
<span class="sig-name descname"><span class="pre">run</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">outputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stream</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session.run"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session.run" title="Link to this definition">#</a></dt>
<dd><p>&#64;brief: Run the TensorRT engine with the given inputs and outputs
&#64;param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor
&#64;param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor
&#64;param stream: cuda stream to enqueue the TensorRT engine on
&#64;param context: TensorRT execution context, if None, use the default context
&#64;return: True if enqueue succeeded, note the enqueue is an async call,
returning True does not mean the execution is finished</p>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.runtime">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">runtime</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Runtime</span></em><a class="headerlink" href="#tensorrt_llm.runtime.Session.runtime" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.Session.set_shapes">
<span class="sig-name descname"><span class="pre">set_shapes</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">tensor_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">IExecutionContext</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#Session.set_shapes"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.Session.set_shapes" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.StoppingCriteria">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">StoppingCriteria</span></span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#StoppingCriteria"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.StoppingCriteria" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Base class for all stopping criteria that can be applied during generation.</p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.StoppingCriteriaList">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">StoppingCriteriaList</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">iterable</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">()</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">/</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#StoppingCriteriaList"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.StoppingCriteriaList" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">list</span></code>, <a class="reference internal" href="#tensorrt_llm.runtime.StoppingCriteria" title="tensorrt_llm.runtime.generation.StoppingCriteria"><code class="xref py py-class docutils literal notranslate"><span class="pre">StoppingCriteria</span></code></a></p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">TensorInfo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="s"><span class="pre">'str'</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="s"><span class="pre">'trt.DataType'</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="s"><span class="pre">'tuple'</span></span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#TensorInfo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.dtype">
<span class="sig-name descname"><span class="pre">dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">DataType</span></em><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.dtype" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.name">
<span class="sig-name descname"><span class="pre">name</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.name" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.numel">
<span class="sig-name descname"><span class="pre">numel</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#TensorInfo.numel"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.numel" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.shape">
<span class="sig-name descname"><span class="pre">shape</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">tuple</span></em><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.shape" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.squeeze">
<span class="sig-name descname"><span class="pre">squeeze</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#TensorInfo.squeeze"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.squeeze" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.TensorInfo.view">
<span class="sig-name descname"><span class="pre">view</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/session.html#TensorInfo.view"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.TensorInfo.view" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="tensorrt_llm.runtime.decode_words_list">
<span class="sig-prename descclassname"><span class="pre">tensorrt_llm.runtime.</span></span><span class="sig-name descname"><span class="pre">decode_words_list</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">word_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/runtime/generation.html#decode_words_list"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.runtime.decode_words_list" title="Link to this definition">#</a></dt>
<dd><dl class="simple">
<dt>format of word_dict</dt><dd><p>len(word_dict) should be the same as batch_size.
word_dict[i] means the words for batch i.
len(word_dict[i]) &gt;= 1, which means it must contain at least 1 string.
For example, word_dict[2] = [" I am happy", " I am sad"].</p>
</dd>
</dl>
</dd></dl>
</section>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="tensorrt_llm.quantization.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Quantization</p>
</div>
</a>
<a class="right-next"
href="../_cpp_gen/executor.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Executor</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<dialog id="pst-secondary-sidebar-modal"></dialog>
<div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ChatGLMGenerationSession"><code class="docutils literal notranslate"><span class="pre">ChatGLMGenerationSession</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.EncDecModelRunner"><code class="docutils literal notranslate"><span class="pre">EncDecModelRunner</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.EncDecModelRunner.encoder_run"><code class="docutils literal notranslate"><span class="pre">encoder_run()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.EncDecModelRunner.from_engine"><code class="docutils literal notranslate"><span class="pre">from_engine()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.EncDecModelRunner.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.EncDecModelRunner.process_input"><code class="docutils literal notranslate"><span class="pre">process_input()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSequence"><code class="docutils literal notranslate"><span class="pre">GenerationSequence</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSequence.get_batch_idx"><code class="docutils literal notranslate"><span class="pre">get_batch_idx()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSequence.get_seq_idx"><code class="docutils literal notranslate"><span class="pre">get_seq_idx()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession"><code class="docutils literal notranslate"><span class="pre">GenerationSession</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.batch_size"><code class="docutils literal notranslate"><span class="pre">batch_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.buffer_allocated"><code class="docutils literal notranslate"><span class="pre">buffer_allocated</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.context_mem_size"><code class="docutils literal notranslate"><span class="pre">context_mem_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.conv_kernel"><code class="docutils literal notranslate"><span class="pre">conv_kernel</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.cross_attention"><code class="docutils literal notranslate"><span class="pre">cross_attention</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.cuda_graph_mode"><code class="docutils literal notranslate"><span class="pre">cuda_graph_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.cuda_stream_guard"><code class="docutils literal notranslate"><span class="pre">cuda_stream_guard()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.debug_mode"><code class="docutils literal notranslate"><span class="pre">debug_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.debug_tensors_to_save"><code class="docutils literal notranslate"><span class="pre">debug_tensors_to_save</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.decode"><code class="docutils literal notranslate"><span class="pre">decode()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.decode_batch"><code class="docutils literal notranslate"><span class="pre">decode_batch()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.decode_regular"><code class="docutils literal notranslate"><span class="pre">decode_regular()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.decode_stream"><code class="docutils literal notranslate"><span class="pre">decode_stream()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.device"><code class="docutils literal notranslate"><span class="pre">device</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.dtype"><code class="docutils literal notranslate"><span class="pre">dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.dump_debug_buffers"><code class="docutils literal notranslate"><span class="pre">dump_debug_buffers()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.early_stop_criteria"><code class="docutils literal notranslate"><span class="pre">early_stop_criteria()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.engine_inspector"><code class="docutils literal notranslate"><span class="pre">engine_inspector</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.filter_medusa_logits"><code class="docutils literal notranslate"><span class="pre">filter_medusa_logits()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.finalize_decoder"><code class="docutils literal notranslate"><span class="pre">finalize_decoder()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.find_best_medusa_path"><code class="docutils literal notranslate"><span class="pre">find_best_medusa_path()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.first_layer"><code class="docutils literal notranslate"><span class="pre">first_layer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.gemm_allreduce_plugin"><code class="docutils literal notranslate"><span class="pre">gemm_allreduce_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.get_next_medusa_tokens"><code class="docutils literal notranslate"><span class="pre">get_next_medusa_tokens()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.get_num_heads_kv"><code class="docutils literal notranslate"><span class="pre">get_num_heads_kv()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.handle_per_step"><code class="docutils literal notranslate"><span class="pre">handle_per_step()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.has_position_embedding"><code class="docutils literal notranslate"><span class="pre">has_position_embedding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.has_token_type_embedding"><code class="docutils literal notranslate"><span class="pre">has_token_type_embedding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.head_size"><code class="docutils literal notranslate"><span class="pre">head_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.hidden_size"><code class="docutils literal notranslate"><span class="pre">hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.is_medusa_mode"><code class="docutils literal notranslate"><span class="pre">is_medusa_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.is_redrafter_mode"><code class="docutils literal notranslate"><span class="pre">is_redrafter_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.kv_cache_type"><code class="docutils literal notranslate"><span class="pre">kv_cache_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.last_layer"><code class="docutils literal notranslate"><span class="pre">last_layer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.locate_accepted_draft_tokens"><code class="docutils literal notranslate"><span class="pre">locate_accepted_draft_tokens()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.mapping"><code class="docutils literal notranslate"><span class="pre">mapping</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.max_draft_tokens"><code class="docutils literal notranslate"><span class="pre">max_draft_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">max_prompt_embedding_table_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_decode_and_verify"><code class="docutils literal notranslate"><span class="pre">medusa_decode_and_verify()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_paths"><code class="docutils literal notranslate"><span class="pre">medusa_paths</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_position_offsets"><code class="docutils literal notranslate"><span class="pre">medusa_position_offsets</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_temperature"><code class="docutils literal notranslate"><span class="pre">medusa_temperature</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_topks"><code class="docutils literal notranslate"><span class="pre">medusa_topks</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.medusa_tree_ids"><code class="docutils literal notranslate"><span class="pre">medusa_tree_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids"><code class="docutils literal notranslate"><span class="pre">next_medusa_input_ids()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.num_draft_tokens"><code class="docutils literal notranslate"><span class="pre">num_draft_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.num_heads"><code class="docutils literal notranslate"><span class="pre">num_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.num_layers"><code class="docutils literal notranslate"><span class="pre">num_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.num_medusa_heads"><code class="docutils literal notranslate"><span class="pre">num_medusa_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.paged_kv_cache"><code class="docutils literal notranslate"><span class="pre">paged_kv_cache</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.paged_state"><code class="docutils literal notranslate"><span class="pre">paged_state</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.pp_communicate_final_output_ids"><code class="docutils literal notranslate"><span class="pre">pp_communicate_final_output_ids()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.pp_communicate_new_tokens"><code class="docutils literal notranslate"><span class="pre">pp_communicate_new_tokens()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.process_logits_including_draft"><code class="docutils literal notranslate"><span class="pre">process_logits_including_draft()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.profiler"><code class="docutils literal notranslate"><span class="pre">profiler</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.quant_mode"><code class="docutils literal notranslate"><span class="pre">quant_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.remove_input_padding"><code class="docutils literal notranslate"><span class="pre">remove_input_padding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.reorder_kv_cache_for_beam_search"><code class="docutils literal notranslate"><span class="pre">reorder_kv_cache_for_beam_search()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.rnn_conv_dim_size"><code class="docutils literal notranslate"><span class="pre">rnn_conv_dim_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.rnn_head_size"><code class="docutils literal notranslate"><span class="pre">rnn_head_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.rnn_hidden_size"><code class="docutils literal notranslate"><span class="pre">rnn_hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.runtime"><code class="docutils literal notranslate"><span class="pre">runtime</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.setup"><code class="docutils literal notranslate"><span class="pre">setup()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.state_dtype"><code class="docutils literal notranslate"><span class="pre">state_dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.state_size"><code class="docutils literal notranslate"><span class="pre">state_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.tokens_per_block"><code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.update_output_ids_by_offset"><code class="docutils literal notranslate"><span class="pre">update_output_ids_by_offset()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.use_gemm_allreduce_plugin"><code class="docutils literal notranslate"><span class="pre">use_gemm_allreduce_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin"><code class="docutils literal notranslate"><span class="pre">use_gpt_attention_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.use_kv_cache"><code class="docutils literal notranslate"><span class="pre">use_kv_cache</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.use_lora_plugin"><code class="docutils literal notranslate"><span class="pre">use_lora_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.use_mamba_conv1d_plugin"><code class="docutils literal notranslate"><span class="pre">use_mamba_conv1d_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.GenerationSession.vocab_size"><code class="docutils literal notranslate"><span class="pre">vocab_size</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.KVCacheManager"><code class="docutils literal notranslate"><span class="pre">KVCacheManager</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.KVCacheManager.add_sequence"><code class="docutils literal notranslate"><span class="pre">add_sequence()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.KVCacheManager.get_block_offsets"><code class="docutils literal notranslate"><span class="pre">get_block_offsets()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.KVCacheManager.step"><code class="docutils literal notranslate"><span class="pre">step()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.LogitsProcessor"><code class="docutils literal notranslate"><span class="pre">LogitsProcessor</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.LogitsProcessorList"><code class="docutils literal notranslate"><span class="pre">LogitsProcessorList</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig"><code class="docutils literal notranslate"><span class="pre">ModelConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.conv_kernel"><code class="docutils literal notranslate"><span class="pre">conv_kernel</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.cross_attention"><code class="docutils literal notranslate"><span class="pre">cross_attention</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.dtype"><code class="docutils literal notranslate"><span class="pre">dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.gemm_allreduce_plugin"><code class="docutils literal notranslate"><span class="pre">gemm_allreduce_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.gpt_attention_plugin"><code class="docutils literal notranslate"><span class="pre">gpt_attention_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.gpu_weights_percent"><code class="docutils literal notranslate"><span class="pre">gpu_weights_percent</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.has_position_embedding"><code class="docutils literal notranslate"><span class="pre">has_position_embedding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.has_token_type_embedding"><code class="docutils literal notranslate"><span class="pre">has_token_type_embedding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.head_size"><code class="docutils literal notranslate"><span class="pre">head_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.hidden_size"><code class="docutils literal notranslate"><span class="pre">hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.kv_cache_type"><code class="docutils literal notranslate"><span class="pre">kv_cache_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.language_adapter_config"><code class="docutils literal notranslate"><span class="pre">language_adapter_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.layer_types"><code class="docutils literal notranslate"><span class="pre">layer_types</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.lora_plugin"><code class="docutils literal notranslate"><span class="pre">lora_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.lora_target_modules"><code class="docutils literal notranslate"><span class="pre">lora_target_modules</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.mamba_conv1d_plugin"><code class="docutils literal notranslate"><span class="pre">mamba_conv1d_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.max_batch_size"><code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.max_beam_width"><code class="docutils literal notranslate"><span class="pre">max_beam_width</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.max_medusa_tokens"><code class="docutils literal notranslate"><span class="pre">max_medusa_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">max_prompt_embedding_table_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.model_name"><code class="docutils literal notranslate"><span class="pre">model_name</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_heads"><code class="docutils literal notranslate"><span class="pre">num_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads"><code class="docutils literal notranslate"><span class="pre">num_kv_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_cross_attn_layer"><code class="docutils literal notranslate"><span class="pre">num_kv_heads_per_cross_attn_layer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_layer"><code class="docutils literal notranslate"><span class="pre">num_kv_heads_per_layer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_layers"><code class="docutils literal notranslate"><span class="pre">num_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.num_medusa_heads"><code class="docutils literal notranslate"><span class="pre">num_medusa_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.paged_state"><code class="docutils literal notranslate"><span class="pre">paged_state</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.quant_mode"><code class="docutils literal notranslate"><span class="pre">quant_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.redrafter_draft_len_per_beam"><code class="docutils literal notranslate"><span class="pre">redrafter_draft_len_per_beam</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.redrafter_num_beams"><code class="docutils literal notranslate"><span class="pre">redrafter_num_beams</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.remove_input_padding"><code class="docutils literal notranslate"><span class="pre">remove_input_padding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.rnn_conv_dim_size"><code class="docutils literal notranslate"><span class="pre">rnn_conv_dim_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.rnn_head_size"><code class="docutils literal notranslate"><span class="pre">rnn_head_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.rnn_hidden_size"><code class="docutils literal notranslate"><span class="pre">rnn_hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.skip_cross_attn_blocks"><code class="docutils literal notranslate"><span class="pre">skip_cross_attn_blocks</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.skip_cross_kv"><code class="docutils literal notranslate"><span class="pre">skip_cross_kv</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.state_dtype"><code class="docutils literal notranslate"><span class="pre">state_dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.state_size"><code class="docutils literal notranslate"><span class="pre">state_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.tokens_per_block"><code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules"><code class="docutils literal notranslate"><span class="pre">trtllm_modules_to_hf_modules</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelConfig.vocab_size"><code class="docutils literal notranslate"><span class="pre">vocab_size</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner"><code class="docutils literal notranslate"><span class="pre">ModelRunner</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.dtype"><code class="docutils literal notranslate"><span class="pre">dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.from_dir"><code class="docutils literal notranslate"><span class="pre">from_dir()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.from_engine"><code class="docutils literal notranslate"><span class="pre">from_engine()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.hidden_size"><code class="docutils literal notranslate"><span class="pre">hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.mapping"><code class="docutils literal notranslate"><span class="pre">mapping</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">max_prompt_embedding_table_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.max_sequence_length"><code class="docutils literal notranslate"><span class="pre">max_sequence_length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.num_heads"><code class="docutils literal notranslate"><span class="pre">num_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.num_layers"><code class="docutils literal notranslate"><span class="pre">num_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.remove_input_padding"><code class="docutils literal notranslate"><span class="pre">remove_input_padding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.serialize_engine"><code class="docutils literal notranslate"><span class="pre">serialize_engine()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.use_lora_plugin"><code class="docutils literal notranslate"><span class="pre">use_lora_plugin</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.vocab_size"><code class="docutils literal notranslate"><span class="pre">vocab_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunner.vocab_size_padded"><code class="docutils literal notranslate"><span class="pre">vocab_size_padded</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp"><code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.dtype"><code class="docutils literal notranslate"><span class="pre">dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.from_dir"><code class="docutils literal notranslate"><span class="pre">from_dir()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.hidden_size"><code class="docutils literal notranslate"><span class="pre">hidden_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">max_prompt_embedding_table_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.max_sequence_length"><code class="docutils literal notranslate"><span class="pre">max_sequence_length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.num_heads"><code class="docutils literal notranslate"><span class="pre">num_heads</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.num_layers"><code class="docutils literal notranslate"><span class="pre">num_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.remove_input_padding"><code class="docutils literal notranslate"><span class="pre">remove_input_padding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.vocab_size"><code class="docutils literal notranslate"><span class="pre">vocab_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.ModelRunnerCpp.vocab_size_padded"><code class="docutils literal notranslate"><span class="pre">vocab_size_padded</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner"><code class="docutils literal notranslate"><span class="pre">MultimodalModelRunner</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir"><code class="docutils literal notranslate"><span class="pre">audio_engine_dir</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.cpp_e2e"><code class="docutils literal notranslate"><span class="pre">cpp_e2e</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.cpp_llm_only"><code class="docutils literal notranslate"><span class="pre">cpp_llm_only</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_audio_features"><code class="docutils literal notranslate"><span class="pre">get_audio_features()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index"><code class="docutils literal notranslate"><span class="pre">get_rope_index()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.get_visual_features"><code class="docutils literal notranslate"><span class="pre">get_visual_features()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_audio_encoder"><code class="docutils literal notranslate"><span class="pre">init_audio_encoder()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_image_encoder"><code class="docutils literal notranslate"><span class="pre">init_image_encoder()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_llm"><code class="docutils literal notranslate"><span class="pre">init_llm()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_processor"><code class="docutils literal notranslate"><span class="pre">init_processor()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.init_tokenizer"><code class="docutils literal notranslate"><span class="pre">init_tokenizer()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir"><code class="docutils literal notranslate"><span class="pre">llm_engine_dir</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio"><code class="docutils literal notranslate"><span class="pre">load_test_audio()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.load_test_data"><code class="docutils literal notranslate"><span class="pre">load_test_data()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.prepare_position_ids_for_cogvlm"><code class="docutils literal notranslate"><span class="pre">prepare_position_ids_for_cogvlm()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.preprocess"><code class="docutils literal notranslate"><span class="pre">preprocess()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup"><code class="docutils literal notranslate"><span class="pre">ptuning_setup()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_fuyu"><code class="docutils literal notranslate"><span class="pre">ptuning_setup_fuyu()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_llava_next"><code class="docutils literal notranslate"><span class="pre">ptuning_setup_llava_next()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_phi3"><code class="docutils literal notranslate"><span class="pre">ptuning_setup_phi3()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.python_e2e"><code class="docutils literal notranslate"><span class="pre">python_e2e</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.run"><code class="docutils literal notranslate"><span class="pre">run()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts"><code class="docutils literal notranslate"><span class="pre">setup_fake_prompts()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_qwen2vl"><code class="docutils literal notranslate"><span class="pre">setup_fake_prompts_qwen2vl()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_vila"><code class="docutils literal notranslate"><span class="pre">setup_fake_prompts_vila()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.setup_inputs"><code class="docutils literal notranslate"><span class="pre">setup_inputs()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.split_prompt_by_images"><code class="docutils literal notranslate"><span class="pre">split_prompt_by_images()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.tokenizer_image_token"><code class="docutils literal notranslate"><span class="pre">tokenizer_image_token()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.video_preprocess"><code class="docutils literal notranslate"><span class="pre">video_preprocess()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir"><code class="docutils literal notranslate"><span class="pre">visual_engine_dir</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.QWenForCausalLMGenerationSession"><code class="docutils literal notranslate"><span class="pre">QWenForCausalLMGenerationSession</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.QWenForCausalLMGenerationSession.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig"><code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.bad_words_list"><code class="docutils literal notranslate"><span class="pre">bad_words_list</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.beam_search_diversity_rate"><code class="docutils literal notranslate"><span class="pre">beam_search_diversity_rate</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.early_stopping"><code class="docutils literal notranslate"><span class="pre">early_stopping</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.end_id"><code class="docutils literal notranslate"><span class="pre">end_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.frequency_penalty"><code class="docutils literal notranslate"><span class="pre">frequency_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.length_penalty"><code class="docutils literal notranslate"><span class="pre">length_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.max_attention_window_size"><code class="docutils literal notranslate"><span class="pre">max_attention_window_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.max_new_tokens"><code class="docutils literal notranslate"><span class="pre">max_new_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.min_length"><code class="docutils literal notranslate"><span class="pre">min_length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.min_p"><code class="docutils literal notranslate"><span class="pre">min_p</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.no_repeat_ngram_size"><code class="docutils literal notranslate"><span class="pre">no_repeat_ngram_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.num_beams"><code class="docutils literal notranslate"><span class="pre">num_beams</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.num_return_sequences"><code class="docutils literal notranslate"><span class="pre">num_return_sequences</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.output_cum_log_probs"><code class="docutils literal notranslate"><span class="pre">output_cum_log_probs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.output_log_probs"><code class="docutils literal notranslate"><span class="pre">output_log_probs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.output_sequence_lengths"><code class="docutils literal notranslate"><span class="pre">output_sequence_lengths</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.pad_id"><code class="docutils literal notranslate"><span class="pre">pad_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.presence_penalty"><code class="docutils literal notranslate"><span class="pre">presence_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.random_seed"><code class="docutils literal notranslate"><span class="pre">random_seed</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.repetition_penalty"><code class="docutils literal notranslate"><span class="pre">repetition_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.return_dict"><code class="docutils literal notranslate"><span class="pre">return_dict</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.sink_token_length"><code class="docutils literal notranslate"><span class="pre">sink_token_length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.stop_words_list"><code class="docutils literal notranslate"><span class="pre">stop_words_list</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.temperature"><code class="docutils literal notranslate"><span class="pre">temperature</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.top_k"><code class="docutils literal notranslate"><span class="pre">top_k</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.top_p"><code class="docutils literal notranslate"><span class="pre">top_p</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.top_p_decay"><code class="docutils literal notranslate"><span class="pre">top_p_decay</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.top_p_min"><code class="docutils literal notranslate"><span class="pre">top_p_min</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.top_p_reset_ids"><code class="docutils literal notranslate"><span class="pre">top_p_reset_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.update"><code class="docutils literal notranslate"><span class="pre">update()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.SamplingConfig.use_beam_hyps"><code class="docutils literal notranslate"><span class="pre">use_beam_hyps</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session"><code class="docutils literal notranslate"><span class="pre">Session</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.context"><code class="docutils literal notranslate"><span class="pre">context</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.context_mem_size"><code class="docutils literal notranslate"><span class="pre">context_mem_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.engine"><code class="docutils literal notranslate"><span class="pre">engine</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.from_engine"><code class="docutils literal notranslate"><span class="pre">from_engine()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.from_serialized_engine"><code class="docutils literal notranslate"><span class="pre">from_serialized_engine()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.infer_shapes"><code class="docutils literal notranslate"><span class="pre">infer_shapes()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.run"><code class="docutils literal notranslate"><span class="pre">run()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.runtime"><code class="docutils literal notranslate"><span class="pre">runtime</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.Session.set_shapes"><code class="docutils literal notranslate"><span class="pre">set_shapes()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.StoppingCriteria"><code class="docutils literal notranslate"><span class="pre">StoppingCriteria</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.StoppingCriteriaList"><code class="docutils literal notranslate"><span class="pre">StoppingCriteriaList</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo"><code class="docutils literal notranslate"><span class="pre">TensorInfo</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.dtype"><code class="docutils literal notranslate"><span class="pre">dtype</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.name"><code class="docutils literal notranslate"><span class="pre">name</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.numel"><code class="docutils literal notranslate"><span class="pre">numel()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.shape"><code class="docutils literal notranslate"><span class="pre">shape</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.squeeze"><code class="docutils literal notranslate"><span class="pre">squeeze()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.TensorInfo.view"><code class="docutils literal notranslate"><span class="pre">view()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.runtime.decode_words_list"><code class="docutils literal notranslate"><span class="pre">decode_words_list()</span></code></a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<a class="footer-brand logo" href="https://www.nvidia.com">
<img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
<img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
</a></div>
<div class="footer-item">
<div class="footer-links">
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
|
<a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
|
<a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
</div>
</div>
<div class="footer-item">
<p class="copyright">
Copyright © 2025, NVIDIA.
<br/>
</p>
</div>
</div>
</div>
</footer>
</body>
</html>