TensorRT-LLMs/_modules/tensorrt_llm/sampling_params.html
2025-09-30 03:07:06 +00:00

1142 lines
106 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en" data-content_root="../../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>tensorrt_llm.sampling_params &#8212; TensorRT LLM</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!--
this give us a css class that will be invisible only if js is disabled
-->
<noscript>
<style>
.pst-js-only { display: none !important; }
</style>
</noscript>
<!-- Loaded before other Sphinx assets -->
<link href="../../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link href="../../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
<link rel="stylesheet" type="text/css" href="../../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../../_static/autodoc_pydantic.css" />
<link rel="stylesheet" type="text/css" href="../../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../../_static/custom.css?v=19d20f17" />
<!-- So that users can add custom icons -->
<script src="../../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
<script src="../../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../../_static/doctools.js?v=9a2dae69"></script>
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../../_static/copybutton.js?v=65e89d2a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/tensorrt_llm/sampling_params';</script>
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
<link rel="icon" href="../../_static/favicon.png"/>
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.2.0rc0" />
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<dialog id="pst-search-dialog">
<form class="bd-search d-flex align-items-center"
action="../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form>
</dialog>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
<button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
<span class="fa-solid fa-bars"></span>
</button>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../index.html">
<img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT LLM - Home"/>
<img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT LLM - Home"/>
<p class="title logo__title">TensorRT LLM</p>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<dialog id="pst-primary-sidebar-modal"></dialog>
<div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
<a class="navbar-brand logo" href="../../index.html">
<img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT LLM - Home"/>
<img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT LLM - Home"/>
<p class="title logo__title">TensorRT LLM</p>
</a>
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Table of Contents">
<p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
<div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../installation/index.html">Installation</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../installation/containers.html">Pre-built release container images on NGC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../installation/linux.html">Installing on Linux via <code class="docutils literal notranslate"><span class="pre">pip</span></code></a></li>
<li class="toctree-l2"><a class="reference internal" href="../../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Deployment Guide</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate text asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate text in streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_kv_cache_connector.html">KV Cache Connector</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_kv_cache_offloading.html">KV Cache Offloading</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
</ul>
</details></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client.html">Curl Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client for Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client_for_lora.html">Openai Completion Client For Lora</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client_json_schema.html">OpenAI Completion Client with JSON Schema</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../examples/dynamo_k8s_example.html">Dynamo K8s Example</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../deployment-guide/index.html">Model Recipes</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.html">Quick Start Recipe for DeepSeek R1 on TensorRT LLM - Blackwell &amp; Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.html">Quick Start Recipe for Llama3.3 70B on TensorRT LLM - Blackwell &amp; Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.html">Quick Start Recipe for Llama4 Scout 17B on TensorRT LLM - Blackwell &amp; Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.html">Quick Start Recipe for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Models</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../models/supported-models.html">Supported Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../models/adding-new-model.html">Adding a New Model</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-eval.html">trtllm-eval</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">API Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../llm-api/index.html">LLM API Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../llm-api/reference.html">API Reference</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Features</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../features/feature-combination-matrix.html">Feature Combination Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/disagg-serving.html">Disaggregated Serving (Beta)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/kvcache.html">KV Cache System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/long-sequence.html">Long Sequences</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/lora.html">LoRA (Low-Rank Adaptation)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/multi-modality.html">Multimodal Support in TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/overlap-scheduler.html">Overlap Scheduler</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/paged-attention-ifb-scheduler.html">Paged Attention, IFB, and Request Scheduling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/parallel-strategy.html">Parallelism in TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/sampling.html">Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/speculative-decoding.html">Speculative Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/overview.html">Architecture Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/perf-analysis.html">Performance Analysis</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/perf-benchmarking.html">TensorRT LLM Benchmarking</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/ci-overview.html">Continuous Integration Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/dev-containers.html">Using Dev Containers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developer-guide/api-change.html">LLM API Change Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog10_ADP_Balance_Strategy.html">ADP Balance Strategy</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog11_GPT_OSS_Eagle3.html">Running GPT-OSS-120B with Eagle3 Speculative Decoding on GB200/B200 (TensorRT LLM)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html">Combining Guided Decoding and Speculative Decoding: Making CPU and GPU Cooperate Seamlessly</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html">Inference Time Compute Implementation in TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html">Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.html">Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html">Disaggregated Serving in TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html">How to launch Llama4 Maverick + Eagle3 TensorRT LLM server</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html">N-GramSpeculativeDecodingin TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html">How to get best performance on DeepSeek-R1 in TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Quick Links</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/releases">Releases</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM">Github Code</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap">Roadmap</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb" class="d-print-none">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../index.html" class="nav-link">Module code</a></li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">tensorrt_llm.sampling_params</span></li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<h1>Source code for tensorrt_llm.sampling_params</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span><span class="w"> </span><span class="nn">json</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">abc</span><span class="w"> </span><span class="kn">import</span> <span class="n">ABC</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">fields</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">List</span><span class="p">,</span> <span class="n">NamedTuple</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="n">BaseModel</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.bindings</span><span class="w"> </span><span class="kn">import</span> <span class="n">executor</span> <span class="k">as</span> <span class="n">tllme</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
<div class="viewcode-block" id="GuidedDecodingParams">
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.GuidedDecodingParams">[docs]</a>
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">GuidedDecodingParams</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Guided decoding parameters for text generation. Only one of the fields could be effective.</span>
<span class="sd"> Args:</span>
<span class="sd"> json (str, pydantic.main.BaseModel, dict, optional): The generated text is amenable to json format with additional user-specified restrictions, namely schema. Defaults to None.</span>
<span class="sd"> regex (str, optional): The generated text is amenable to the user-specified regular expression. Defaults to None.</span>
<span class="sd"> grammar (str, optional): The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.</span>
<span class="sd"> json_object (bool): If True, the generated text is amenable to json format. Defaults to False.</span>
<span class="sd"> structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Structural tag is supported by xgrammar backend only. Defaults to None.</span>
<span class="sd"> &quot;&quot;&quot;</span> <span class="c1"># noqa: E501</span>
<span class="n">json</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="nb">dict</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">regex</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">grammar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">json_object</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">structural_tag</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_validate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">num_guides</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">_field</span> <span class="ow">in</span> <span class="n">fields</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">num_guides</span> <span class="o">+=</span> <span class="nb">bool</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">_field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
<span class="k">if</span> <span class="n">num_guides</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Only one guide can be used for a request, but got </span><span class="si">{</span><span class="n">num_guides</span><span class="si">}</span><span class="s2">.&quot;</span><span class="p">)</span></div>
<span class="k">class</span><span class="w"> </span><span class="nc">LogprobParams</span><span class="p">(</span><span class="n">NamedTuple</span><span class="p">):</span>
<span class="n">prompt_logprobs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">logprobs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># Drop the logits once the logprobs are computed</span>
<span class="n">drop_context_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="c1"># Drop the geneation_logits once the logprobs are computed</span>
<span class="n">drop_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">class</span><span class="w"> </span><span class="nc">LogitsProcessor</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Base class for logits processor.</span>
<span class="sd"> The recommended way to create a customized logits processor:</span>
<span class="sd"> * Subclass this class and implement the processing logics in the __call__ method.</span>
<span class="sd"> * Create an instance and pass to SamplingParams.</span>
<span class="sd"> Alternatively, you can create any callable with the same signature with the __call__ method.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">req_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">logits</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span>
<span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]],</span>
<span class="n">stream_ptr</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span>
<span class="n">client_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Logits processing callback. The callback is expected to inplace modify the logits.</span>
<span class="sd"> Args:</span>
<span class="sd"> req_id (int): Request id.</span>
<span class="sd"> logits (torch.Tensor): Logits tensor to be modified.</span>
<span class="sd"> token_ids (List[List[int]]): Token ids produced by the request so far.</span>
<span class="sd"> The shape is beam_width * sequence_length.</span>
<span class="sd"> stream_ptr (int, optional): The operation stream used by the logits tensor.</span>
<span class="sd"> Not required for PyTorch backend.</span>
<span class="sd"> client_id (int, optional): An optional client id.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">pass</span> <span class="c1"># noqa</span>
<span class="k">class</span><span class="w"> </span><span class="nc">BatchedLogitsProcessor</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Base class for batched logits processor.</span>
<span class="sd"> The recommended way to create a customized batched logits processor:</span>
<span class="sd"> * Subclass this class and implement the processing logics in the __call__ method.</span>
<span class="sd"> * Create an instance and pass to LLM.</span>
<span class="sd"> Alternatively, you can create any callable with the same signature with the __call__ method.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">req_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span>
<span class="n">logits</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span>
<span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]],</span>
<span class="n">stream_ptr</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">client_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Batched logits processing callback. The callback is expected to inplace modify the logits.</span>
<span class="sd"> Args:</span>
<span class="sd"> req_ids (List[int]): A batch of request ids.</span>
<span class="sd"> logits (List[torch.Tensor]): A batch of the logits tensors.</span>
<span class="sd"> token_ids (List[List[List[int]]]): A batch of the token ids produced by the requests so far.</span>
<span class="sd"> The shape is batch * beam_width * sequence_length.</span>
<span class="sd"> stream_ptr (int): The operation stream used by the logits tensors.</span>
<span class="sd"> client_ids (List[Optional[int]]): A batch of optional client ids.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">pass</span> <span class="c1"># noqa</span>
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">AdditionalModelOutput</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;An additional output to gather from the model.</span>
<span class="sd"> Args:</span>
<span class="sd"> name (str): The name of the additional output to gather from the model.</span>
<span class="sd"> gather_context (bool): A value indicating whether or not to gather the additional output from the context too. Defaults to False.</span>
<span class="sd"> &quot;&quot;&quot;</span> <span class="c1"># noqa: E501</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span>
<span class="n">gather_context</span><span class="p">:</span> <span class="nb">bool</span>
<div class="viewcode-block" id="SamplingParams">
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams">[docs]</a>
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">SamplingParams</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sampling parameters for text generation.</span>
<span class="sd"> Usage Examples:</span>
<span class="sd"> use_beam_search is False:</span>
<span class="sd"> - best_of is None: (top-p/top-k) sampling n responses and return n generations</span>
<span class="sd"> - best_of is not None: (top-p/top-k) sampling best_of responses and return n generations (best_of &gt;= n must hold)</span>
<span class="sd"> use_beam_search is True:</span>
<span class="sd"> - best_of is None: beam search with beam width of n, return n generations</span>
<span class="sd"> - best_of is not None: beam search with beam width of best_of, return n generations (best_of &gt;= n must hold)</span>
<span class="sd"> Args:</span>
<span class="sd"> end_id (int, optional): The end token id. Defaults to None.</span>
<span class="sd"> pad_id (int, optional): The pad token id. Defaults to None.</span>
<span class="sd"> max_tokens (int): The maximum number of tokens to generate. Defaults to 32.</span>
<span class="sd"> bad (str, List[str], optional): A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output. Defaults to None.</span>
<span class="sd"> bad_token_ids (List[int], optional): A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output. Defaults to None.</span>
<span class="sd"> stop (str, List[str], optional): A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True. Defaults to None.</span>
<span class="sd"> stop_token_ids (List[int], optional): A list of token ids that stop the generation when they are generated. Defaults to None.</span>
<span class="sd"> include_stop_str_in_output (bool): Whether to include the stop strings in output text. Defaults to False.</span>
<span class="sd"> embedding_bias (torch.Tensor, optional): The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. Defaults to None.</span>
<span class="sd"> logits_processor (tensorrt_llm.sampling_params.LogitsProcessor, List[tensorrt_llm.sampling_params.LogitsProcessor], optional): The logits postprocessor callback(s). Defaults to None.</span>
<span class="sd"> If a list, each processor is applied in order during generation (supported in PyTorch backend only).</span>
<span class="sd"> apply_batched_logits_processor (bool): Whether to apply batched logits postprocessor callback. Defaults to False.</span>
<span class="sd"> The BatchedLogitsProcessor class is recommended for callback creation. The callback must be provided when initializing LLM.</span>
<span class="sd"> n (int): Number of sequences to generate. Defaults to 1.</span>
<span class="sd"> best_of (int, optional): Number of sequences to consider for best output. Defaults to None.</span>
<span class="sd"> use_beam_search (bool): Whether to use beam search. Defaults to False.</span>
<span class="sd"> top_k (int, optional): Controls number of logits to sample from. None means using C++ runtime default 0, i.e., all logits. Defaults to None.</span>
<span class="sd"> top_p (float, optional): Controls the top-P probability to sample from. None means using C++ runtime default 0.f. Defaults to None.</span>
<span class="sd"> top_p_min (float, optional): Controls decay in the top-P algorithm. topPMin is lower-bound. None means using C++ runtime default 1.e-6. Defaults to None.</span>
<span class="sd"> top_p_reset_ids (int, optional): Controls decay in the top-P algorithm. Indicates where to reset the decay. None means using C++ runtime default 1. Defaults to None.</span>
<span class="sd"> top_p_decay (float, optional): Controls decay in the top-P algorithm. The decay value. None means using C++ runtime default 1.f. Defaults to None.</span>
<span class="sd"> seed (int, optional): Controls the random seed used by the random number generator in sampling. None means using C++ runtime default 0. Defaults to None.</span>
<span class="sd"> temperature (float, optional): Controls the modulation of logits when sampling new tokens. It can have values &gt; 0.f. None means using C++ runtime default 1.0f. Defaults to None.</span>
<span class="sd"> min_tokens (int, optional): Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. None means using C++ runtime default 1. Defaults to None.</span>
<span class="sd"> beam_search_diversity_rate (float, optional): Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourages repetition, values &gt; 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.</span>
<span class="sd"> repetition_penalty (float, optional): Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourages repetition, values &gt; 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.</span>
<span class="sd"> presence_penalty (float, optional): Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. None means using C++ runtime default 0.f. Defaults to None.</span>
<span class="sd"> frequency_penalty (float, optional): Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. None means using C++ runtime default 0.f. Defaults to None.</span>
<span class="sd"> length_penalty (float, optional): Controls how to penalize longer sequences in beam search. None means using C++ runtime default 0.f. Defaults to None.</span>
<span class="sd"> early_stopping (int, optional): Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token). None means using C++ runtime default 1. Defaults to None.</span>
<span class="sd"> no_repeat_ngram_size (int, optional): Controls how many repeat ngram size are acceptable. None means using C++ runtime default 1 &lt;&lt; 30. Defaults to None.</span>
<span class="sd"> min_p (float, optional): scale the most likely token to determine the minimum token probability. None means using C++ runtime default 0.0. Defaults to None.</span>
<span class="sd"> beam_width_array (List[int], optional): The array of beam width using in Variable-Beam-Width-Search. Defaults to None.</span>
<span class="sd"> logprobs (int, optional): Number of log probabilities to return per output token. Defaults to None.</span>
<span class="sd"> prompt_logprobs (int, optional): Number of log probabilities to return per prompt token. Defaults to None.</span>
<span class="sd"> return_context_logits (bool): Controls if Result should contain the context logits. Defaults to False.</span>
<span class="sd"> return_generation_logits (bool): Controls if Result should contain the generation logits. Defaults to False.</span>
<span class="sd"> exclude_input_from_output (bool): Controls if output tokens in Result should include the input tokens. Defaults to True.</span>
<span class="sd"> return_encoder_output (bool): Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Defaults to False.</span>
<span class="sd"> return_perf_metrics (bool): Controls if Result should contain the performance metrics for this request. Defaults to False.</span>
<span class="sd"> additional_model_outputs (List[tensorrt_llm.sampling_params.AdditionalModelOutput], optional): The additional outputs to gather from the model. Defaults to None.</span>
<span class="sd"> lookahead_config (tensorrt_llm.bindings.executor.LookaheadDecodingConfig , optional): Lookahead decoding config. Defaults to None.</span>
<span class="sd"> guided_decoding (tensorrt_llm.sampling_params.GuidedDecodingParams, optional): Guided decoding params. Defaults to None.</span>
<span class="sd"> ignore_eos (bool): Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. Defaults to False.</span>
<span class="sd"> detokenize (bool): Whether to detokenize the output. Defaults to True.</span>
<span class="sd"> add_special_tokens (bool): Whether to add special tokens to the prompt. Defaults to True.</span>
<span class="sd"> truncate_prompt_tokens (int, optional): If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None.</span>
<span class="sd"> skip_special_tokens (bool): Whether to skip special tokens in the output. Defaults to True.</span>
<span class="sd"> spaces_between_special_tokens (bool): Whether to add spaces between special tokens in the output. Defaults to True.</span>
<span class="sd"> &quot;&quot;&quot;</span> <span class="c1"># noqa: E501</span>
<span class="c1"># [TO DEVELOPER] This class provides an interface to LLMAPI users.</span>
<span class="c1"># Internally, it manages and dispatches fields to Python bindings of C++ objects, currently including:</span>
<span class="c1"># (1) all fields of tllme.SamplingConfig;</span>
<span class="c1"># (2) all fields of tllme.OutputConfig;</span>
<span class="c1"># (3) some fields of tllme.Request.</span>
<span class="c1"># If you changed the implementation of C++ objects and corresponding Python bindings, please update:</span>
<span class="c1"># (1) the fields and corresponding docstring of this class, and</span>
<span class="c1"># (2) the expected_fields defined in _get_xxx_config methods.</span>
<span class="n">end_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pad_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">max_tokens</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">32</span>
<span class="n">bad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">bad_token_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">_bad_word_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">stop</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">stop_token_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">include_stop_str_in_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">_stop_word_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">embedding_bias</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">logits_processor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">LogitsProcessor</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">LogitsProcessor</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">apply_batched_logits_processor</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">best_of</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">use_beam_search</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="c1"># Keep the below fields in sync with tllme.SamplingConfig or maintin the mapping table.</span>
<span class="n">top_k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">top_p</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">top_p_min</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">top_p_reset_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">top_p_decay</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">temperature</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">min_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">beam_search_diversity_rate</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">repetition_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">presence_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">frequency_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">length_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">early_stopping</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">no_repeat_ngram_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">min_p</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">beam_width_array</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># Keep the below fields in sync with tllme.OutputConfig</span>
<span class="n">logprobs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">prompt_logprobs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">return_context_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">return_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">exclude_input_from_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">return_encoder_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">return_perf_metrics</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">additional_model_outputs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">AdditionalModelOutput</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># Used in logprobs calculation in TRT flow to drop logits early if user did not explicitly request them.</span>
<span class="c1"># Can be deprecated after migration to PyTorch backend.</span>
<span class="n">_context_logits_auto_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">_generation_logits_auto_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="c1"># TODO: deprecate this after trtllm-serve migrate to use TopK logprobs</span>
<span class="n">_return_log_probs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="c1"># Lookahead decoding config</span>
<span class="n">lookahead_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">tllme</span><span class="o">.</span><span class="n">LookaheadDecodingConfig</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># Guided decoding params</span>
<span class="n">guided_decoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">GuidedDecodingParams</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="c1"># Tokenizer-related configs</span>
<span class="n">ignore_eos</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">detokenize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">add_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">truncate_prompt_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">skip_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">spaces_between_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="c1"># Currently, _stream_interval is only used to pass llm.args.stream_interval to tokenizer.</span>
<span class="c1"># TODO: make this a per-request parameter.</span>
<span class="n">_stream_interval</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span>
<span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span><span class="o">.</span><span class="n">detach</span><span class="p">()</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">embedding_bias</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_validate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Verify the sampling parameters.</span>
<span class="sd"> This function verifies the sampling parameters in the LLM API, which</span>
<span class="sd"> may have stricter requirements than the Executor class of C++ runtime.</span>
<span class="sd"> For instance, while the greedy decoding with n &gt; 1 is capable in the</span>
<span class="sd"> Executor class of C++ runtime, the LLM API disallows such combination.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;best_of (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot be less than n (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="mi">1</span>
<span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_greedy_decoding</span>
<span class="ow">and</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;TLLM_ALLOW_N_GREEDY_DECODING&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Greedy decoding in the LLM API does not allow multiple &quot;</span>
<span class="sa">f</span><span class="s2">&quot;returns. Please set to best_of=1, got best_of=</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">. &quot;</span>
<span class="sa">f</span><span class="s2">&quot;Please set to best_of=1 or set an environment variable &quot;</span>
<span class="sa">f</span><span class="s2">&quot;TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of &gt; 1 &quot;</span>
<span class="sa">f</span><span class="s2">&quot;under the greedy decoding.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="o">&lt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;truncate_prompt_tokens must be &gt;= 1, got </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">_validate</span><span class="p">()</span>
<span class="c1"># correct types as users might pass in logprob=True for Top-1 logprobs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">logprobs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">logprobs</span> <span class="ow">and</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">logprobs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">prompt_logprobs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">prompt_logprobs</span> <span class="ow">and</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">prompt_logprobs</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_greedy_decoding</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span>
<span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span>
<span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">top_k</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">top_k</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span>
<span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">top_p</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">top_p</span> <span class="o">==</span> <span class="mf">0.0</span><span class="p">)</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_need_return_context_logits</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">return_context_logits</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_context_logits_auto_enabled</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_need_return_generation_logits</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">return_generation_logits</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_generation_logits_auto_enabled</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_setup</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">tokenizer</span><span class="p">,</span> <span class="n">hf_model_config</span><span class="p">,</span> <span class="n">generation_config</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SamplingParams&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">end_id</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">eos_token_id</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">pad_token_id</span>
<span class="c1"># kimi_k2 model uses the eos_token_id in generation config</span>
<span class="k">if</span> <span class="p">(</span>
<span class="n">hf_model_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="ow">and</span> <span class="n">hf_model_config</span><span class="o">.</span><span class="n">model_type</span> <span class="o">==</span> <span class="s2">&quot;kimi_k2&quot;</span>
<span class="ow">and</span> <span class="n">generation_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">generation_config</span><span class="o">.</span><span class="n">eos_token_id</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span>
<span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">end_id</span> <span class="o">=</span> <span class="n">generation_config</span><span class="o">.</span><span class="n">eos_token_id</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_encode</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">text</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="o">=</span><span class="n">add_special_tokens</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="c1"># For tiktokenizer, the encode method does not have add_special_tokens argument</span>
<span class="k">return</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">strs</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span> <span class="o">=</span> <span class="p">[</span><span class="n">_encode</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">s</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">strs</span><span class="p">]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">strs</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span> <span class="o">=</span> <span class="p">[</span><span class="n">_encode</span><span class="p">(</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">s</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">strs</span><span class="p">]</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_bad_words</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span>
<span class="n">words</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad_token_ids</span><span class="p">:</span>
<span class="n">words</span> <span class="o">=</span> <span class="p">[[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad_token_ids</span><span class="p">]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">words</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.bad (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="si">}</span><span class="s2">) is not processed by tokenizer, &quot;</span>
<span class="s2">&quot;please call the setup method.&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">words</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_stop_words</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span>
<span class="n">words</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="p">:</span>
<span class="n">words</span> <span class="o">=</span> <span class="p">[[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="p">]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">words</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.stop (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="si">}</span><span class="s2">) is not processed by tokenizer, &quot;</span>
<span class="s2">&quot;please call the setup method.&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">words</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_stop_reasons_and_words</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]]:</span>
<span class="n">stop_reasons</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">)</span>
<span class="n">stop_words</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_stop_words</span><span class="p">()</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">stop_reasons</span><span class="p">,</span> <span class="n">stop_words</span><span class="p">))</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_sampling_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">:</span>
<span class="c1"># A map from the SamplingConfig fields of the LLM API to their</span>
<span class="c1"># corresponding field names of the Executor of TRT-LLM C++ runtime.</span>
<span class="c1"># In sampling, there is no parameter that directly matches &#39;best_of&#39;,</span>
<span class="c1"># so outputs must be trimmed during postprocessing.</span>
<span class="c1"># | LLM API | TRT-LLM Executor |</span>
<span class="c1"># --------------|-----------------|------------------------|</span>
<span class="c1"># | Beam search | use_beam_search | beam_width &gt; 1 |</span>
<span class="c1"># | Beam search | n | num_return_sequences |</span>
<span class="c1"># | Beam search | best_of | beam_width |</span>
<span class="c1"># |-------------|-----------------|------------------------|</span>
<span class="c1"># | Sampling | use_beam_search | beam_width == 1 |</span>
<span class="c1"># | Sampling | n | num_return_sequences |</span>
<span class="c1"># | Sampling | best_of | no corresponding param |</span>
<span class="n">fields</span> <span class="o">=</span> <span class="p">{</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">f</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;__&quot;</span><span class="p">)}</span>
<span class="n">unmatched_params</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;num_return_sequences&quot;</span><span class="p">,</span>
<span class="s2">&quot;beam_width&quot;</span><span class="p">,</span>
<span class="s2">&quot;n&quot;</span><span class="p">,</span>
<span class="s2">&quot;best_of&quot;</span><span class="p">,</span>
<span class="s2">&quot;use_beam_search&quot;</span><span class="p">,</span>
<span class="p">]</span>
<span class="n">llmapi_to_rt_param_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">f</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">fields</span> <span class="k">if</span> <span class="n">f</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">unmatched_params</span><span class="p">}</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span><span class="p">:</span>
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s2">&quot;num_return_sequences&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span>
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s2">&quot;beam_width&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s2">&quot;num_return_sequences&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span>
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s2">&quot;beam_width&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">(</span><span class="o">**</span><span class="n">llmapi_to_rt_param_map</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_output_config</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">is_pytorch_backend</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">:</span>
<span class="n">sampling_param_fields</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">dir</span><span class="p">(</span><span class="n">SamplingParams</span><span class="p">))</span>
<span class="n">fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">f</span>
<span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">f</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;__&quot;</span><span class="p">)</span> <span class="ow">and</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">sampling_param_fields</span>
<span class="p">]</span>
<span class="n">config_kwargs</span> <span class="o">=</span> <span class="p">{</span><span class="n">f</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">fields</span><span class="p">}</span>
<span class="k">if</span> <span class="n">is_pytorch_backend</span><span class="p">:</span>
<span class="n">config_kwargs</span><span class="p">[</span><span class="s2">&quot;return_log_probs&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="nb">bool</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">logprobs</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">prompt_logprobs</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">return_context_logits</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="s2">&quot;Since prompt_logprobs is requested but return_context_logits is False, &quot;</span>
<span class="s2">&quot;internally enabling context logits for prompt logprobs computation. &quot;</span>
<span class="s2">&quot;context logits will be dropped after computation as the user didn&#39;t explicitly request them.&quot;</span>
<span class="p">)</span>
<span class="c1"># TODO: Find a more elegant way to do this.</span>
<span class="c1"># NOTE: This is an internal hack, so we can entirely avoid introducing</span>
<span class="c1"># `prompt_logprobs` into the executor bindings and further into</span>
<span class="c1"># model engine / sampler.</span>
<span class="c1"># This is because, prompt_logprobs is a derived quantity from</span>
<span class="c1"># context logits, and the capability to post-compute it</span>
<span class="c1"># already exists in the worker. (see _get_logprobs in worker.py)</span>
<span class="n">config_kwargs</span><span class="p">[</span><span class="s2">&quot;return_context_logits&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">config_kwargs</span><span class="p">[</span><span class="s2">&quot;return_log_probs&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_return_log_probs</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">(</span><span class="o">**</span><span class="n">config_kwargs</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_get_guided_decoding_params</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json_object</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span><span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">JSON</span><span class="p">)</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">json_schema</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json_schema</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">):</span>
<span class="n">json_schema</span> <span class="o">=</span> <span class="n">json_schema</span><span class="o">.</span><span class="n">model_json_schema</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json_schema</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="n">json_schema</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">json_schema</span><span class="p">)</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">JSON_SCHEMA</span><span class="p">,</span> <span class="n">json_schema</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">regex</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">REGEX</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">regex</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">grammar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">EBNF_GRAMMAR</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">grammar</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">structural_tag</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">STRUCTURAL_TAG</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">structural_tag</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span></div>
</pre></div>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
</div>
</footer>
</div>
<div class="bd-sidebar-secondary"></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<a class="footer-brand logo" href="https://www.nvidia.com">
<img src="../../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
<img src="../../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
</a></div>
<div class="footer-item">
<div class="footer-links">
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
|
<a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
|
<a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
</div>
</div>
<div class="footer-item">
<p class="copyright">
Copyright © 2025, NVidia.
<br/>
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on September 29, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/560ded5">560ded5</a>.</p>
</div></div>
</div>
</div>
</footer>
</body>
</html>