TensorRT-LLMs/llm-api/reference.html
2024-11-05 14:01:36 +08:00

1454 lines
248 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>API Reference &mdash; tensorrt_llm documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=19f00094" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="LLM Examples Introduction" href="../llm-api-examples/index.html" />
<link rel="prev" title="API Introduction" href="index.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
tensorrt_llm
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="index.html">API Introduction</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">API Reference</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM"><code class="docutils literal notranslate"><span class="pre">LLM</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.__init__"><code class="docutils literal notranslate"><span class="pre">LLM.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.generate"><code class="docutils literal notranslate"><span class="pre">LLM.generate()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.generate_async"><code class="docutils literal notranslate"><span class="pre">LLM.generate_async()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.save"><code class="docutils literal notranslate"><span class="pre">LLM.save()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.tokenizer"><code class="docutils literal notranslate"><span class="pre">LLM.tokenizer</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.LLM.workspace"><code class="docutils literal notranslate"><span class="pre">LLM.workspace</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput"><code class="docutils literal notranslate"><span class="pre">RequestOutput</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput.__init__"><code class="docutils literal notranslate"><span class="pre">RequestOutput.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput.handle_response"><code class="docutils literal notranslate"><span class="pre">RequestOutput.handle_response()</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams"><code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.__init__"><code class="docutils literal notranslate"><span class="pre">SamplingParams.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.add_special_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.add_special_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.bad"><code class="docutils literal notranslate"><span class="pre">SamplingParams.bad</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.bad_token_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.bad_token_ids</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.beam_search_diversity_rate"><code class="docutils literal notranslate"><span class="pre">SamplingParams.beam_search_diversity_rate</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.beam_width"><code class="docutils literal notranslate"><span class="pre">SamplingParams.beam_width</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.early_stopping"><code class="docutils literal notranslate"><span class="pre">SamplingParams.early_stopping</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.embedding_bias"><code class="docutils literal notranslate"><span class="pre">SamplingParams.embedding_bias</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.end_id"><code class="docutils literal notranslate"><span class="pre">SamplingParams.end_id</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.exclude_input_from_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.exclude_input_from_output</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.external_draft_tokens_config"><code class="docutils literal notranslate"><span class="pre">SamplingParams.external_draft_tokens_config</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.frequency_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.frequency_penalty</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.include_stop_str_in_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.include_stop_str_in_output</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.length_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.length_penalty</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.logits_post_processor_name"><code class="docutils literal notranslate"><span class="pre">SamplingParams.logits_post_processor_name</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.max_new_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.max_new_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.max_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.max_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.min_length"><code class="docutils literal notranslate"><span class="pre">SamplingParams.min_length</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.min_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.min_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.no_repeat_ngram_size"><code class="docutils literal notranslate"><span class="pre">SamplingParams.no_repeat_ngram_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.pad_id"><code class="docutils literal notranslate"><span class="pre">SamplingParams.pad_id</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.presence_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.presence_penalty</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.prompt_tuning_config"><code class="docutils literal notranslate"><span class="pre">SamplingParams.prompt_tuning_config</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.random_seed"><code class="docutils literal notranslate"><span class="pre">SamplingParams.random_seed</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.repetition_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.repetition_penalty</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.return_context_logits"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_context_logits</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.return_encoder_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_encoder_output</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.return_generation_logits"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_generation_logits</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.return_log_probs"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_log_probs</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.seed"><code class="docutils literal notranslate"><span class="pre">SamplingParams.seed</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.setup"><code class="docutils literal notranslate"><span class="pre">SamplingParams.setup()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.stop"><code class="docutils literal notranslate"><span class="pre">SamplingParams.stop</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.stop_token_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.stop_token_ids</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.temperature"><code class="docutils literal notranslate"><span class="pre">SamplingParams.temperature</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.top_k"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_k</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.top_p"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.top_p_decay"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_decay</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.top_p_min"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_min</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams.top_p_reset_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_reset_ids</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.__init__"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.cross_kv_cache_fraction"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.cross_kv_cache_fraction</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.enable_block_reuse"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.enable_block_reuse</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.free_gpu_memory_fraction"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.free_gpu_memory_fraction</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.host_cache_size"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.host_cache_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.max_attention_window"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.max_attention_window</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.max_tokens"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.max_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.onboard_blocks"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.onboard_blocks</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig.sink_token_length"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.sink_token_length</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.SchedulerConfig"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SchedulerConfig.__init__"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SchedulerConfig.capacity_scheduler_policy"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.capacity_scheduler_policy</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.SchedulerConfig.context_chunking_policy"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.context_chunking_policy</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.GUARANTEED_NO_EVICT</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.MAX_UTILIZATION"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.MAX_UTILIZATION</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.STATIC_BATCH"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.STATIC_BATCH</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.__init__"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.name"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.name</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.value"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.value</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig"><code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.__init__"><code class="docutils literal notranslate"><span class="pre">BuildConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.auto_parallel_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.auto_parallel_config</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.dry_run"><code class="docutils literal notranslate"><span class="pre">BuildConfig.dry_run</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.enable_debug_output"><code class="docutils literal notranslate"><span class="pre">BuildConfig.enable_debug_output</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.force_num_profiles"><code class="docutils literal notranslate"><span class="pre">BuildConfig.force_num_profiles</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.from_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.from_json_file"><code class="docutils literal notranslate"><span class="pre">BuildConfig.from_json_file()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">BuildConfig.gather_context_logits</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">BuildConfig.gather_generation_logits</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.input_timing_cache"><code class="docutils literal notranslate"><span class="pre">BuildConfig.input_timing_cache</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.kv_cache_type"><code class="docutils literal notranslate"><span class="pre">BuildConfig.kv_cache_type</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.lora_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.lora_config</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_batch_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_batch_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_beam_width"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_beam_width</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_draft_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_draft_len</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_encoder_input_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_encoder_input_len</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_input_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_input_len</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_num_tokens"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_num_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_prompt_embedding_table_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.max_seq_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_seq_len</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.opt_batch_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.opt_batch_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.opt_num_tokens"><code class="docutils literal notranslate"><span class="pre">BuildConfig.opt_num_tokens</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.output_timing_cache"><code class="docutils literal notranslate"><span class="pre">BuildConfig.output_timing_cache</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.plugin_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.plugin_config</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.profiling_verbosity"><code class="docutils literal notranslate"><span class="pre">BuildConfig.profiling_verbosity</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.speculative_decoding_mode"><code class="docutils literal notranslate"><span class="pre">BuildConfig.speculative_decoding_mode</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.strongly_typed"><code class="docutils literal notranslate"><span class="pre">BuildConfig.strongly_typed</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.to_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.update"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.update_from_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update_from_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.update_kv_cache_type"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update_kv_cache_type()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.use_fused_mlp"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_fused_mlp</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.use_refit"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_refit</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.use_strip_plan"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_strip_plan</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.visualize_network"><code class="docutils literal notranslate"><span class="pre">BuildConfig.visualize_network</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.weight_sparsity"><code class="docutils literal notranslate"><span class="pre">BuildConfig.weight_sparsity</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig.weight_streaming"><code class="docutils literal notranslate"><span class="pre">BuildConfig.weight_streaming</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig"><code class="docutils literal notranslate"><span class="pre">QuantConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.__init__"><code class="docutils literal notranslate"><span class="pre">QuantConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.clamp_val"><code class="docutils literal notranslate"><span class="pre">QuantConfig.clamp_val</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.exclude_modules"><code class="docutils literal notranslate"><span class="pre">QuantConfig.exclude_modules</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">QuantConfig.from_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.get_modelopt_kv_cache_dtype"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_modelopt_kv_cache_dtype()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.get_modelopt_qformat"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_modelopt_qformat()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.get_quant_cfg"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_quant_cfg()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.group_size"><code class="docutils literal notranslate"><span class="pre">QuantConfig.group_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.has_zero_point"><code class="docutils literal notranslate"><span class="pre">QuantConfig.has_zero_point</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.kv_cache_quant_algo"><code class="docutils literal notranslate"><span class="pre">QuantConfig.kv_cache_quant_algo</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.layer_quant_mode"><code class="docutils literal notranslate"><span class="pre">QuantConfig.layer_quant_mode</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.pre_quant_scale"><code class="docutils literal notranslate"><span class="pre">QuantConfig.pre_quant_scale</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.quant_algo"><code class="docutils literal notranslate"><span class="pre">QuantConfig.quant_algo</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.quant_mode"><code class="docutils literal notranslate"><span class="pre">QuantConfig.quant_mode</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.requires_calibration"><code class="docutils literal notranslate"><span class="pre">QuantConfig.requires_calibration</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.requires_modelopt_quantization"><code class="docutils literal notranslate"><span class="pre">QuantConfig.requires_modelopt_quantization</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.smoothquant_val"><code class="docutils literal notranslate"><span class="pre">QuantConfig.smoothquant_val</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">QuantConfig.to_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig.use_plugin_sq"><code class="docutils literal notranslate"><span class="pre">QuantConfig.use_plugin_sq</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo"><code class="docutils literal notranslate"><span class="pre">QuantAlgo</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.FP8"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.FP8</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.INT8"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.INT8</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.MIXED_PRECISION"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.MIXED_PRECISION</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.NO_QUANT"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.NO_QUANT</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16_AWQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16_AWQ</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16_GPTQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16_GPTQ</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W4A8_AWQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A8_AWQ</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A16"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A16</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig"><code class="docutils literal notranslate"><span class="pre">CalibConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.__init__"><code class="docutils literal notranslate"><span class="pre">CalibConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.calib_batch_size"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_batch_size</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.calib_batches"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_batches</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.calib_dataset"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_dataset</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.calib_max_seq_length"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_max_seq_length</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.device"><code class="docutils literal notranslate"><span class="pre">CalibConfig.device</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">CalibConfig.from_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.random_seed"><code class="docutils literal notranslate"><span class="pre">CalibConfig.random_seed</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">CalibConfig.to_dict()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig.tokenizer_max_seq_length"><code class="docutils literal notranslate"><span class="pre">CalibConfig.tokenizer_max_seq_length</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig</span></code></a><ul>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig.cache_root"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.cache_root</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig.max_records"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_records</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig.max_cache_storage_gb"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_cache_storage_gb</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig.__init__"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.__init__()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#id0"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.cache_root</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#id1"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_cache_storage_gb</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#id2"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_records</span></code></a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestError"><code class="docutils literal notranslate"><span class="pre">RequestError</span></code></a></li>
</ul>
</li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/customization.html">Common Customizations</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/llm_api_examples.html">Examples</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/batch-manager.html">The Batch Manager in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html#lookahead-decoding">Lookahead decoding</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-best-practices.html">Best Practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">tensorrt_llm</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">API Reference</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/llm-api/reference.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="api-reference">
<h1>API Reference<a class="headerlink" href="#api-reference" title="Link to this heading"></a></h1>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">LLM</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#LLM"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.LLM" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>LLM class is the main class for running a LLM model.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>model</strong> (<em>str</em>) The model name or a local path to the model directory. It could be a HuggingFace(HF) model name,
a local path to the HF model, or a local path to the TRT-LLM engine or checkpoint.</p></li>
<li><p><strong>tokenizer</strong> (<em>Optional</em><em>[</em><em>Union</em><em>[</em><em>str</em><em>, </em><em>Path</em><em>, </em><em>TokenizerBase</em><em>, </em><em>PreTrainedTokenizerBase</em><em>]</em><em>]</em>) The tokenizer name or a local
path to the tokenizer directory.</p></li>
<li><p><strong>skip_tokenizer_init</strong> If true, skip initialization of tokenizer and detokenizer. generate and generate_async
will accept prompt token ids as input only.</p></li>
<li><p><strong>tensor_parallel_size</strong> (<em>int</em>) The number of processes for tensor parallelism.</p></li>
<li><p><strong>dtype</strong> (<em>str</em>) The data type for the model weights and activations.</p></li>
<li><p><strong>trust_remote_code</strong> (<em>bool</em><em>, </em><em>default=False</em>) Download the model and tokenizer from trust remote code (e.g, Hugging Face)</p></li>
<li><p><strong>revision</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The revision of the model.</p></li>
<li><p><strong>tokenzier_revision</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The revision of the tokenizer.</p></li>
<li><p><strong>auto_parallel</strong> (<em>bool</em><em>, </em><em>default=False</em>) Enable auto parallel mode.</p></li>
<li><p><strong>pipeline_parallel_size</strong> (<em>int</em><em>, </em><em>default=1</em>) The pipeline parallel size.</p></li>
<li><p><strong>enable_lora</strong> (<em>bool</em><em>, </em><em>default=False</em>) Enable LoRA adapters.</p></li>
<li><p><strong>max_lora_rank</strong> (<em>int</em><em>, </em><em>default=None</em>) Maximum LoRA rank. If specified, it overrides <cite>build_config.lora_config.max_lora_rank</cite>.</p></li>
<li><p><strong>max_loras</strong> (<em>int</em><em>, </em><em>default=4</em>) Maximum number of LoRA adapters to be stored in GPU memory.</p></li>
<li><p><strong>max_cpu_loras</strong> (<em>int</em><em>, </em><em>default=4</em>) Maximum number of LoRA adapters to be stored in CPU memory.</p></li>
<li><p><strong>build_config</strong> (<a class="reference internal" href="#tensorrt_llm.hlapi.BuildConfig" title="tensorrt_llm.hlapi.BuildConfig"><em>BuildConfig</em></a><em>, </em><em>default=BuildConfig</em><em>(</em><em>)</em>) The build configuration for the model.
Default is an empty BuildConfig instance.</p></li>
<li><p><strong>quant_config</strong> (<a class="reference internal" href="#tensorrt_llm.hlapi.QuantConfig" title="tensorrt_llm.hlapi.QuantConfig"><em>QuantConfig</em></a><em>, </em><em>default=QuantConfig</em><em>(</em><em>)</em>) The quantization configuration for the model.
Default is an empty QuantConfig instance.</p></li>
<li><p><strong>calib_config</strong> (<a class="reference internal" href="#tensorrt_llm.hlapi.CalibConfig" title="tensorrt_llm.hlapi.CalibConfig"><em>CalibConfig</em></a><em>, </em><em>default=CalibConfig</em><em>(</em><em>)</em>) The calibration configuration for the model.</p></li>
<li><p><strong>embedding_parallel_mode</strong> (<em>str</em><em>, </em><em>default=&quot;SHARDING_ALONG_VOCAB&quot;</em>) The parallel mode for embeddings.</p></li>
<li><p><strong>share_embedding_table</strong> (<em>bool</em><em>, </em><em>default=False</em>) Whether to share the embedding table.</p></li>
<li><p><strong>kv_cache_config</strong> (<a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig" title="tensorrt_llm.hlapi.KvCacheConfig"><em>KvCacheConfig</em></a><em>, </em><em>optional</em>) The key-value cache configuration for the model.
Default is None.</p></li>
<li><p><strong>peft_cache_config</strong> (<em>PeftCacheConfig</em><em>, </em><em>optional</em>) The PEFT cache configuration for the model.
Default is None.</p></li>
<li><p><strong>decoding_config</strong> (<em>DecodingConfig</em><em>, </em><em>optional</em>) The decoding configuration for the model.
Default is None.</p></li>
<li><p><strong>logits_post_processor_map</strong> (<em>Dict</em><em>[</em><em>str</em><em>, </em><em>Callable</em><em>]</em><em>, </em><em>optional</em>) A map of logit post-processing functions.
Default is None.</p></li>
<li><p><strong>scheduler_config</strong> (<a class="reference internal" href="#tensorrt_llm.hlapi.SchedulerConfig" title="tensorrt_llm.hlapi.SchedulerConfig"><em>SchedulerConfig</em></a><em>, </em><em>default=SchedulerConfig</em><em>(</em><em>)</em>) The scheduler configuration for the model.
Default is an empty SchedulerConfig instance.</p></li>
<li><p><strong>normalize_log_probs</strong> (<em>bool</em><em>, </em><em>default=False</em>) Whether to normalize log probabilities for the model.</p></li>
<li><p><strong>iter_stats_max_iterations</strong> (<em>int</em><em>, </em><em>optional</em>) The maximum number of iterations for iteration statistics.
Default is None.</p></li>
<li><p><strong>request_stats_max_iterations</strong> (<em>int</em><em>, </em><em>optional</em>) The maximum number of iterations for request statistics.
Default is None.</p></li>
<li><p><strong>batching_type</strong> (<em>BatchingType</em><em>, </em><em>optional</em>) The batching type for the model.
Default is None.</p></li>
<li><p><strong>enable_build_cache</strong> (<em>bool</em><em> or </em><a class="reference internal" href="#tensorrt_llm.hlapi.BuildCacheConfig" title="tensorrt_llm.hlapi.BuildCacheConfig"><em>BuildCacheConfig</em></a><em>, </em><em>optional</em>) Whether to enable build caching for the model.
Default is None.</p></li>
<li><p><strong>enable_tqdm</strong> (<em>bool</em><em>, </em><em>default=False</em>) Whether to display a progress bar during model building.</p></li>
<li><p><strong>trust_remote_code</strong> Whether to trust remote code when downloading model and tokenizer from Hugging Face.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#LLM.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.utils.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.utils.SamplingParams"><span class="pre">SamplingParams</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_tqdm</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">LoRARequest</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#LLM.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.generate" title="Link to this definition"></a></dt>
<dd><p>Generate output for the given prompts in the synchronous mode.
Synchronous generation accepts either single prompt or batched prompts.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> (<em>Union</em><em>[</em><em>PromptInputs</em><em>, </em><em>Sequence</em><em>[</em><em>PromptInputs</em><em>]</em><em>]</em>) The prompt text or token ids.
Note, it must be single prompt or batched prompts.</p></li>
<li><p><strong>sampling_params</strong> (<em>Optional</em><em>[</em><em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.SamplingParams"><em>SamplingParams</em></a><em>, </em><em>List</em><em>[</em><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.SamplingParams"><em>SamplingParams</em></a><em>]</em><em>]</em><em>]</em>) The sampling params for the
generation, a default one will be used if not provided.</p></li>
<li><p><strong>use_tqdm</strong> (<em>bool</em>) Whether to use tqdm to display the progress bar.</p></li>
<li><p><strong>lora_request</strong> (<em>Optional</em><em>[</em><em>Union</em><em>[</em><em>LoRARequest</em><em>, </em><em>Sequence</em><em>[</em><em>LoRARequest</em><em>]</em><em>]</em><em>]</em>) LoRA request to use for generation, if any.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>Union[<a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.RequestOutput">RequestOutput</a>, List[<a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.RequestOutput">RequestOutput</a>]]</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.generate_async">
<span class="sig-name descname"><span class="pre">generate_async</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.utils.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#LLM.generate_async"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.generate_async" title="Link to this definition"></a></dt>
<dd><p>Generate output for the given prompt in the asynchronous mode.
Asynchronous generation accepts single prompt only.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> (<em>PromptInputs</em>) The prompt text or token ids; must be single prompt.</p></li>
<li><p><strong>sampling_params</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.SamplingParams"><em>SamplingParams</em></a><em>]</em>) The sampling params for the generation, a default one will be
used if not provided.</p></li>
<li><p><strong>lora_request</strong> (<em>Optional</em><em>[</em><em>LoRARequest</em><em>]</em>) LoRA request to use for generation, if any.</p></li>
<li><p><strong>streaming</strong> (<em>bool</em>) Whether to use the streaming mode for the generation.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.hlapi.RequestOutput" title="tensorrt_llm.hlapi.RequestOutput">RequestOutput</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.save">
<span class="sig-name descname"><span class="pre">save</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#LLM.save"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.save" title="Link to this definition"></a></dt>
<dd><p>Save the built engine to the given path.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>engine_dir</strong> (<em>str</em>) The path to save the engine.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>None</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.tokenizer">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokenizer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.tokenizer" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.LLM.workspace">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">workspace</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.LLM.workspace" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.RequestOutput">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">RequestOutput</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">generation_result</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">GenerationResult</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#RequestOutput"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.RequestOutput" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">GenerationResult</span></code></p>
<p>The output data of a completion request to the LLM.</p>
<dl class="simple">
<dt>Fields:</dt><dd><p>request_id (int): The unique ID of the request.
prompt (str): The prompt string of the request.
prompt_token_ids (List[int]): The token ids of the prompt.
outputs (List[CompletionOutput]): The output sequences of the request.
context_logits (torch.Tensor): The logits on the prompt token ids.
finished (bool): Whether the whole request is finished.</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.RequestOutput.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">generation_result</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">GenerationResult</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#RequestOutput.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.RequestOutput.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.RequestOutput.handle_response">
<span class="sig-name descname"><span class="pre">handle_response</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">response</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm.html#RequestOutput.handle_response"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.RequestOutput.handle_response" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">SamplingParams</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">external_draft_tokens_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt_tuning_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PromptTuningConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logits_post_processor_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/utils.html#SamplingParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Sampling parameters for text generation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>end_id</strong> (<em>int</em>) The end token id.</p></li>
<li><p><strong>pad_id</strong> (<em>int</em>) The pad token id.</p></li>
<li><p><strong>max_tokens</strong> (<em>int</em>) The maximum number of tokens to generate.</p></li>
<li><p><strong>max_new_tokens</strong> (<em>int</em>) The maximum number of tokens to generate. This argument is being deprecated; please use max_tokens instead.</p></li>
<li><p><strong>bad</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>]</em>) A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output.</p></li>
<li><p><strong>bad_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output.</p></li>
<li><p><strong>stop</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>]</em>) A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True.</p></li>
<li><p><strong>stop_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) A list of token ids that stop the generation when they are generated.</p></li>
<li><p><strong>include_stop_str_in_output</strong> (<em>bool</em>) Whether to include the stop strings in output text. Defaults to False.</p></li>
<li><p><strong>embedding_bias</strong> (<em>torch.Tensor</em>) The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size].</p></li>
<li><p><strong>external_draft_tokens_config</strong> (<em>ExternalDraftTokensConfig</em>) The speculative decoding configuration.</p></li>
<li><p><strong>prompt_tuning_config</strong> (<em>PromptTuningConfig</em>) The prompt tuning configuration.</p></li>
<li><p><strong>logits_post_processor_name</strong> (<em>str</em>) The logits postprocessor name. Must correspond to one of the logits postprocessor name provided to the ExecutorConfig.</p></li>
<li><p><strong>beam_width</strong> (<em>int</em>) The beam width. Default is 1 which disables beam search.</p></li>
<li><p><strong>top_k</strong> (<em>int</em>) Controls number of logits to sample from. Default is 0 (all logits).</p></li>
<li><p><strong>top_p</strong> (<em>float</em>) Controls the top-P probability to sample from. Default is 0.f</p></li>
<li><p><strong>top_p_min</strong> (<em>float</em>) Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.</p></li>
<li><p><strong>top_p_reset_ids</strong> (<em>int</em>) Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.</p></li>
<li><p><strong>top_p_decay</strong> (<em>float</em>) Controls decay in the top-P algorithm. The decay value. Default is 1.f</p></li>
<li><p><strong>seed</strong> (<em>int</em>) Controls the random seed used by the random number generator in sampling</p></li>
<li><p><strong>random_seed</strong> (<em>int</em>) Controls the random seed used by the random number generator in sampling. This argument is being deprecated; please use seed instead.</p></li>
<li><p><strong>temperature</strong> (<em>float</em>) Controls the modulation of logits when sampling new tokens. It can have values &gt; 0.f. Default is 1.0f</p></li>
<li><p><strong>min_tokens</strong> (<em>int</em>) Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1.</p></li>
<li><p><strong>min_length</strong> (<em>int</em>) Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1. This argument is being deprecated; please use min_tokens instead.</p></li>
<li><p><strong>beam_search_diversity_rate</strong> (<em>float</em>) Controls the diversity in beam search.</p></li>
<li><p><strong>repetition_penalty</strong> (<em>float</em>) Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourages repetition, values &gt; 1.f discourages it. Default is 1.f</p></li>
<li><p><strong>presence_penalty</strong> (<em>float</em>) Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</p></li>
<li><p><strong>frequency_penalty</strong> (<em>float</em>) Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</p></li>
<li><p><strong>length_penalty</strong> (<em>float</em>) Controls how to penalize longer sequences in beam search. Default is 0.f</p></li>
<li><p><strong>early_stopping</strong> (<em>int</em>) Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)</p></li>
<li><p><strong>no_repeat_ngram_size</strong> (<em>int</em>) Controls how many repeat ngram size are acceptable. Default is 1 &lt;&lt; 30.</p></li>
<li><p><strong>return_log_probs</strong> (<em>bool</em>) Controls if Result should contain log probabilities. Default is false.</p></li>
<li><p><strong>return_context_logits</strong> (<em>bool</em>) Controls if Result should contain the context logits. Default is false.</p></li>
<li><p><strong>return_generation_logits</strong> (<em>bool</em>) Controls if Result should contain the generation logits. Default is false.</p></li>
<li><p><strong>exclude_input_from_output</strong> (<em>bool</em>) Controls if output tokens in Result should include the input tokens. Default is true.</p></li>
<li><p><strong>return_encoder_output</strong> (<em>bool</em>) Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.</p></li>
<li><p><strong>add_special_tokens</strong> (<em>bool</em>) Whether to add special tokens to the prompt.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">external_draft_tokens_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt_tuning_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PromptTuningConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logits_post_processor_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.add_special_tokens">
<span class="sig-name descname"><span class="pre">add_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.add_special_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.bad">
<span class="sig-name descname"><span class="pre">bad</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.bad" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.bad_token_ids">
<span class="sig-name descname"><span class="pre">bad_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.bad_token_ids" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.beam_search_diversity_rate">
<span class="sig-name descname"><span class="pre">beam_search_diversity_rate</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.beam_search_diversity_rate" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.beam_width">
<span class="sig-name descname"><span class="pre">beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.beam_width" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.early_stopping">
<span class="sig-name descname"><span class="pre">early_stopping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.early_stopping" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.embedding_bias">
<span class="sig-name descname"><span class="pre">embedding_bias</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.embedding_bias" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.end_id">
<span class="sig-name descname"><span class="pre">end_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.end_id" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.exclude_input_from_output">
<span class="sig-name descname"><span class="pre">exclude_input_from_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.exclude_input_from_output" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.external_draft_tokens_config">
<span class="sig-name descname"><span class="pre">external_draft_tokens_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.external_draft_tokens_config" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.frequency_penalty">
<span class="sig-name descname"><span class="pre">frequency_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.frequency_penalty" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.include_stop_str_in_output">
<span class="sig-name descname"><span class="pre">include_stop_str_in_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.include_stop_str_in_output" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.length_penalty">
<span class="sig-name descname"><span class="pre">length_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.length_penalty" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.logits_post_processor_name">
<span class="sig-name descname"><span class="pre">logits_post_processor_name</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.logits_post_processor_name" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.max_new_tokens">
<span class="sig-name descname"><span class="pre">max_new_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.max_new_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.max_tokens">
<span class="sig-name descname"><span class="pre">max_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.max_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.min_length">
<span class="sig-name descname"><span class="pre">min_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.min_length" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.min_tokens">
<span class="sig-name descname"><span class="pre">min_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.min_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.no_repeat_ngram_size">
<span class="sig-name descname"><span class="pre">no_repeat_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.no_repeat_ngram_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.pad_id">
<span class="sig-name descname"><span class="pre">pad_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.pad_id" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.presence_penalty">
<span class="sig-name descname"><span class="pre">presence_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.presence_penalty" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.prompt_tuning_config">
<span class="sig-name descname"><span class="pre">prompt_tuning_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">PromptTuningConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.prompt_tuning_config" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.random_seed">
<span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.random_seed" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.repetition_penalty">
<span class="sig-name descname"><span class="pre">repetition_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.repetition_penalty" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.return_context_logits">
<span class="sig-name descname"><span class="pre">return_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.return_context_logits" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.return_encoder_output">
<span class="sig-name descname"><span class="pre">return_encoder_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.return_encoder_output" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.return_generation_logits">
<span class="sig-name descname"><span class="pre">return_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.return_generation_logits" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.return_log_probs">
<span class="sig-name descname"><span class="pre">return_log_probs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.return_log_probs" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.seed">
<span class="sig-name descname"><span class="pre">seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.seed" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.setup">
<span class="sig-name descname"><span class="pre">setup</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.hlapi.SamplingParams" title="tensorrt_llm.hlapi.utils.SamplingParams"><span class="pre">SamplingParams</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/utils.html#SamplingParams.setup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.setup" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.stop">
<span class="sig-name descname"><span class="pre">stop</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.stop" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.stop_token_ids">
<span class="sig-name descname"><span class="pre">stop_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.stop_token_ids" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.temperature">
<span class="sig-name descname"><span class="pre">temperature</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.temperature" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.top_k">
<span class="sig-name descname"><span class="pre">top_k</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.top_k" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.top_p">
<span class="sig-name descname"><span class="pre">top_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.top_p" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.top_p_decay">
<span class="sig-name descname"><span class="pre">top_p_decay</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.top_p_decay" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.top_p_min">
<span class="sig-name descname"><span class="pre">top_p_min</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.top_p_min" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SamplingParams.top_p_reset_ids">
<span class="sig-name descname"><span class="pre">top_p_reset_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.SamplingParams.top_p_reset_ids" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">KvCacheConfig</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.KvCacheConfig" title="tensorrt_llm.bindings.executor.KvCacheConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheConfig</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_block_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_attention_window</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">free_gpu_memory_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">host_cache_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">onboard_blocks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cross_kv_cache_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.cross_kv_cache_fraction">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cross_kv_cache_fraction</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.cross_kv_cache_fraction" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.enable_block_reuse">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_block_reuse</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.enable_block_reuse" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.free_gpu_memory_fraction">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">free_gpu_memory_fraction</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.free_gpu_memory_fraction" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.host_cache_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">host_cache_size</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.host_cache_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.max_attention_window">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_attention_window</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.max_attention_window" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.max_tokens">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_tokens</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.max_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.onboard_blocks">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">onboard_blocks</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.onboard_blocks" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.KvCacheConfig.sink_token_length">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">sink_token_length</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.KvCacheConfig.sink_token_length" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SchedulerConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">SchedulerConfig</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.SchedulerConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SchedulerConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#tensorrt_llm.hlapi.SchedulerConfig.__init__" title="Link to this definition"></a></dt>
<dd><p>Overloaded function.</p>
<ol class="arabic simple">
<li><p>__init__(self: tensorrt_llm.bindings.executor.SchedulerConfig, capacity_scheduler_policy: tensorrt_llm.bindings.executor.CapacitySchedulerPolicy = CapacitySchedulerPolicy.GUARANTEED_NO_EVICT) -&gt; None</p></li>
<li><p>__init__(self: tensorrt_llm.bindings.executor.SchedulerConfig, capacity_scheduler_policy: tensorrt_llm.bindings.executor.CapacitySchedulerPolicy, context_chunking_policy: Optional[tensorrt_llm.bindings.executor.ContextChunkingPolicy]) -&gt; None</p></li>
</ol>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SchedulerConfig.capacity_scheduler_policy">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">capacity_scheduler_policy</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.SchedulerConfig.capacity_scheduler_policy" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.SchedulerConfig.context_chunking_policy">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context_chunking_policy</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.SchedulerConfig.context_chunking_policy" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">CapacitySchedulerPolicy</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
<p>Members:</p>
<p>MAX_UTILIZATION</p>
<p>GUARANTEED_NO_EVICT</p>
<p>STATIC_BATCH</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT">
<span class="sig-name descname"><span class="pre">GUARANTEED_NO_EVICT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.GUARANTEED_NO_EVICT:</span> <span class="pre">1&gt;</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.MAX_UTILIZATION">
<span class="sig-name descname"><span class="pre">MAX_UTILIZATION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.MAX_UTILIZATION:</span> <span class="pre">0&gt;</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.MAX_UTILIZATION" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.STATIC_BATCH">
<span class="sig-name descname"><span class="pre">STATIC_BATCH</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.STATIC_BATCH:</span> <span class="pre">2&gt;</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.STATIC_BATCH" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy" title="tensorrt_llm.bindings.executor.CapacitySchedulerPolicy"><span class="pre">tensorrt_llm.bindings.executor.CapacitySchedulerPolicy</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">value</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.name">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">name</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.name" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CapacitySchedulerPolicy.value">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">value</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.CapacitySchedulerPolicy.value" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">BuildConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">256</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_fused_mlp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">256</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">~tensorrt_llm.bindings.KVCacheType</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">~tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">~tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">~tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">~tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_fused_mlp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.auto_parallel_config">
<span class="sig-name descname"><span class="pre">auto_parallel_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">AutoParallelConfig</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.auto_parallel_config" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.dry_run">
<span class="sig-name descname"><span class="pre">dry_run</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.dry_run" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.enable_debug_output">
<span class="sig-name descname"><span class="pre">enable_debug_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.enable_debug_output" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.force_num_profiles">
<span class="sig-name descname"><span class="pre">force_num_profiles</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.force_num_profiles" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.from_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.from_json_file">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_json_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_json_file"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.from_json_file" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.gather_context_logits">
<span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.gather_context_logits" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.gather_generation_logits">
<span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.gather_generation_logits" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.input_timing_cache">
<span class="sig-name descname"><span class="pre">input_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.input_timing_cache" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.kv_cache_type">
<span class="sig-name descname"><span class="pre">kv_cache_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">KVCacheType</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.kv_cache_type" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.lora_config">
<span class="sig-name descname"><span class="pre">lora_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LoraConfig</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.lora_config" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_batch_size">
<span class="sig-name descname"><span class="pre">max_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_batch_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_beam_width">
<span class="sig-name descname"><span class="pre">max_beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_beam_width" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_draft_len">
<span class="sig-name descname"><span class="pre">max_draft_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_draft_len" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_encoder_input_len">
<span class="sig-name descname"><span class="pre">max_encoder_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_encoder_input_len" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_input_len">
<span class="sig-name descname"><span class="pre">max_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">256</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_input_len" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_num_tokens">
<span class="sig-name descname"><span class="pre">max_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_num_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_prompt_embedding_table_size">
<span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_prompt_embedding_table_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.max_seq_len">
<span class="sig-name descname"><span class="pre">max_seq_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">512</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.max_seq_len" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.opt_batch_size">
<span class="sig-name descname"><span class="pre">opt_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.opt_batch_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.opt_num_tokens">
<span class="sig-name descname"><span class="pre">opt_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.opt_num_tokens" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.output_timing_cache">
<span class="sig-name descname"><span class="pre">output_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.output_timing_cache" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.plugin_config">
<span class="sig-name descname"><span class="pre">plugin_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html#tensorrt_llm.plugin.PluginConfig" title="tensorrt_llm.plugin.plugin.PluginConfig"><span class="pre">PluginConfig</span></a></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.plugin_config" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.profiling_verbosity">
<span class="sig-name descname"><span class="pre">profiling_verbosity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'layer_names_only'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.profiling_verbosity" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.speculative_decoding_mode">
<span class="sig-name descname"><span class="pre">speculative_decoding_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode" title="tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode"><span class="pre">SpeculativeDecodingMode</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.speculative_decoding_mode" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.strongly_typed">
<span class="sig-name descname"><span class="pre">strongly_typed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.strongly_typed" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.to_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.update">
<span class="sig-name descname"><span class="pre">update</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.update" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.update_from_dict">
<span class="sig-name descname"><span class="pre">update_from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.update_from_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.update_kv_cache_type">
<span class="sig-name descname"><span class="pre">update_kv_cache_type</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model_architecture</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_kv_cache_type"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.update_kv_cache_type" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.use_fused_mlp">
<span class="sig-name descname"><span class="pre">use_fused_mlp</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.use_fused_mlp" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.use_refit">
<span class="sig-name descname"><span class="pre">use_refit</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.use_refit" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.use_strip_plan">
<span class="sig-name descname"><span class="pre">use_strip_plan</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.use_strip_plan" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.visualize_network">
<span class="sig-name descname"><span class="pre">visualize_network</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.visualize_network" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.weight_sparsity">
<span class="sig-name descname"><span class="pre">weight_sparsity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.weight_sparsity" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildConfig.weight_streaming">
<span class="sig-name descname"><span class="pre">weight_streaming</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.BuildConfig.weight_streaming" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">QuantConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Serializable quantization configuration class, part of the PretrainedConfig</p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.clamp_val">
<span class="sig-name descname"><span class="pre">clamp_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.clamp_val" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.exclude_modules">
<span class="sig-name descname"><span class="pre">exclude_modules</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.exclude_modules" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.from_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.get_modelopt_kv_cache_dtype">
<span class="sig-name descname"><span class="pre">get_modelopt_kv_cache_dtype</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_modelopt_kv_cache_dtype"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.get_modelopt_kv_cache_dtype" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.get_modelopt_qformat">
<span class="sig-name descname"><span class="pre">get_modelopt_qformat</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_modelopt_qformat"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.get_modelopt_qformat" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.get_quant_cfg">
<span class="sig-name descname"><span class="pre">get_quant_cfg</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">module_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_quant_cfg"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.get_quant_cfg" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.group_size">
<span class="sig-name descname"><span class="pre">group_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">128</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.group_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.has_zero_point">
<span class="sig-name descname"><span class="pre">has_zero_point</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.has_zero_point" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.kv_cache_quant_algo">
<span class="sig-name descname"><span class="pre">kv_cache_quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.kv_cache_quant_algo" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.layer_quant_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">layer_quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantMode" title="tensorrt_llm.quantization.mode.QuantMode"><span class="pre">QuantMode</span></a></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.layer_quant_mode" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.pre_quant_scale">
<span class="sig-name descname"><span class="pre">pre_quant_scale</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.pre_quant_scale" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.quant_algo">
<span class="sig-name descname"><span class="pre">quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.hlapi.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.quant_algo" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.quant_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">QuantModeWrapper</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.quant_mode" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.requires_calibration">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">requires_calibration</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.requires_calibration" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.requires_modelopt_quantization">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">requires_modelopt_quantization</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.requires_modelopt_quantization" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.smoothquant_val">
<span class="sig-name descname"><span class="pre">smoothquant_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.5</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.smoothquant_val" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.to_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantConfig.use_plugin_sq">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_plugin_sq</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.QuantConfig.use_plugin_sq" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">QuantAlgo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/quantization/mode.html#QuantAlgo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
<p>An enumeration.</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.FP8">
<span class="sig-name descname"><span class="pre">FP8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.FP8" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN">
<span class="sig-name descname"><span class="pre">FP8_PER_CHANNEL_PER_TOKEN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8_PER_CHANNEL_PER_TOKEN'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.INT8">
<span class="sig-name descname"><span class="pre">INT8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'INT8'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.INT8" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.MIXED_PRECISION">
<span class="sig-name descname"><span class="pre">MIXED_PRECISION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'MIXED_PRECISION'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.MIXED_PRECISION" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.NO_QUANT">
<span class="sig-name descname"><span class="pre">NO_QUANT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NO_QUANT'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.NO_QUANT" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W4A16">
<span class="sig-name descname"><span class="pre">W4A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W4A16_AWQ">
<span class="sig-name descname"><span class="pre">W4A16_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16_AWQ" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W4A16_GPTQ">
<span class="sig-name descname"><span class="pre">W4A16_GPTQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_GPTQ'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W4A16_GPTQ" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W4A8_AWQ">
<span class="sig-name descname"><span class="pre">W4A8_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W4A8_AWQ" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A16">
<span class="sig-name descname"><span class="pre">W8A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A16'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A16" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">CalibConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cuda'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cnn_dailymail'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batches</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1234</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm_utils.html#CalibConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Calibration configuration.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>device</strong> (<em>Literal</em><em>[</em><em>'cuda'</em><em>, </em><em>'cpu'</em><em>]</em><em>, </em><em>default='cuda'</em>) The device to run calibration.</p></li>
<li><p><strong>calib_dataset</strong> (<em>str</em><em>, </em><em>default='cnn_dailymail'</em>) The name or local path of calibration dataset.</p></li>
<li><p><strong>calib_batches</strong> (<em>int</em><em>, </em><em>default=512</em>) The number of batches that the calibration runs.</p></li>
<li><p><strong>calib_batch_size</strong> (<em>int</em><em>, </em><em>default=1</em>) The batch size that the calibration runs.</p></li>
<li><p><strong>calib_max_seq_length</strong> (<em>int</em><em>, </em><em>default=512</em>) The maximum sequence length that the calibration runs.</p></li>
<li><p><strong>random_seed</strong> (<em>int</em><em>, </em><em>default=1234</em>) The random seed used for calibration.</p></li>
<li><p><strong>tokenizer_max_seq_length</strong> (<em>int</em><em>, </em><em>default=2048</em>) The maximum sequence length to initialize tokenizer for calibration.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cuda'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cnn_dailymail'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batches</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1234</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.calib_batch_size">
<span class="sig-name descname"><span class="pre">calib_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.calib_batch_size" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.calib_batches">
<span class="sig-name descname"><span class="pre">calib_batches</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.calib_batches" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.calib_dataset">
<span class="sig-name descname"><span class="pre">calib_dataset</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.calib_dataset" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.calib_max_seq_length">
<span class="sig-name descname"><span class="pre">calib_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.calib_max_seq_length" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.device">
<span class="sig-name descname"><span class="pre">device</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.device" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm_utils.html#CalibConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.from_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.random_seed">
<span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.random_seed" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/llm_utils.html#CalibConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.to_dict" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.CalibConfig.tokenizer_max_seq_length">
<span class="sig-name descname"><span class="pre">tokenizer_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.hlapi.CalibConfig.tokenizer_max_seq_length" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildCacheConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">BuildCacheConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/build_cache.html#BuildCacheConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildCacheConfig" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Configuration for the build cache.</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildCacheConfig.cache_root">
<span class="sig-name descname"><span class="pre">cache_root</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.BuildCacheConfig.cache_root" title="Link to this definition"></a></dt>
<dd><p>The root directory for the build cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>str</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildCacheConfig.max_records">
<span class="sig-name descname"><span class="pre">max_records</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.BuildCacheConfig.max_records" title="Link to this definition"></a></dt>
<dd><p>The maximum number of records to store in the cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>int</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildCacheConfig.max_cache_storage_gb">
<span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><a class="headerlink" href="#tensorrt_llm.hlapi.BuildCacheConfig.max_cache_storage_gb" title="Link to this definition"></a></dt>
<dd><p>The maximum amount of storage (in GB) to use for the cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>float</p>
</dd>
</dl>
</dd></dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The build-cache assumes the weights of the model are not changed during the execution. If the weights are
changed, you should remove the caches manually.</p>
</div>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.BuildCacheConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/hlapi/build_cache.html#BuildCacheConfig.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.BuildCacheConfig.__init__" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id0">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cache_root</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#id0" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id1">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><a class="headerlink" href="#id1" title="Link to this definition"></a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id2">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_records</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#id2" title="Link to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.hlapi.RequestError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.hlapi.</span></span><span class="sig-name descname"><span class="pre">RequestError</span></span><a class="reference internal" href="../_modules/tensorrt_llm/executor.html#RequestError"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.hlapi.RequestError" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">RuntimeError</span></code></p>
<p>The error raised when the request is failed.</p>
</dd></dl>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="index.html" class="btn btn-neutral float-left" title="API Introduction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../llm-api-examples/index.html" class="btn btn-neutral float-right" title="LLM Examples Introduction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7fa9d7395780>
<div class="footer">
<p>
Copyright © 2024 NVIDIA Corporation
</p>
<p>
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
data-cms-ai="0">Privacy Policy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
data-cms-ai="0">Manage My Privacy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
data-cms-ai="0">Do Not Sell or Share My Data</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
rel="noopener" data-cms-ai="0">Terms of Service</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
data-cms-ai="0">Accessibility</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
data-cms-ai="0">Product Security</a> |
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
data-cms-ai="0">Contact</a>
</p>
</div>
</div>
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>