<!-- mirror of https://github.com/NVIDIA/TensorRT-LLM.git; synced 2026-01-14 06:27:45 +08:00; 1755 lines; 321 KiB; HTML -->
<!DOCTYPE html>
|
||
<html class="writer-html5" lang="en" data-content_root="../">
|
||
<head>
  <!-- Document metadata. The viewport is declared exactly once; the original
       markup carried two equivalent viewport declarations. -->
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>API Reference — tensorrt_llm documentation</title>

  <!-- Stylesheets, in their original order. -->
  <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
  <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
  <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />

  <!-- Scripts carry neither async nor defer, so they execute in the order
       listed here. Keep that order when editing. -->
  <script src="../_static/jquery.js?v=5d32c60e"></script>
  <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
  <script src="../_static/documentation_options.js?v=5929fcd5"></script>
  <script src="../_static/doctools.js?v=9a2dae69"></script>
  <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
  <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
  <script src="../_static/copybutton.js?v=65e89d2a"></script>
  <script src="../_static/js/theme.js"></script>

  <!-- Link relations: index, search, and the next/previous pages. -->
  <link rel="index" title="Index" href="../genindex.html" />
  <link rel="search" title="Search" href="../search.html" />
  <link rel="next" title="LLM Examples Introduction" href="../llm-api-examples/index.html" />
  <link rel="prev" title="API Introduction" href="index.html" />
</head>
|
||
|
||
<body class="wy-body-for-nav">
|
||
<div class="wy-grid-for-nav">
|
||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||
<div class="wy-side-scroll">
|
||
<div class="wy-side-nav-search" >
|
||
|
||
|
||
|
||
<!-- Project link to ../index.html; the link text is the project name only. -->
<a href="../index.html" class="icon icon-home">
  tensorrt_llm
</a>
|
||
<div role="search">
  <!-- GET form posting to ../search.html: the visible text field is sent as
       "q", alongside two fixed hidden fields (check_keywords=yes,
       area=default). The field has no visible label; its accessible name
       comes from aria-label. -->
  <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
|
||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
||
<!-- Sidebar group "Getting Started": a caption followed by its top-level page links. -->
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul>
  <li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
|
||
<!-- Sidebar group "Installation": a caption followed by its top-level page links. -->
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul>
  <li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
  <li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1"><a class="reference internal" href="index.html">API Introduction</a></li>
|
||
<li class="toctree-l1 current"><a class="current reference internal" href="#">API Reference</a><ul>
|
||
<!-- Sidebar entry for LLM; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM"><code class="docutils literal notranslate"><span class="pre">LLM</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.__init__"><code class="docutils literal notranslate"><span class="pre">LLM.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.generate"><code class="docutils literal notranslate"><span class="pre">LLM.generate()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.generate_async"><code class="docutils literal notranslate"><span class="pre">LLM.generate_async()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.save"><code class="docutils literal notranslate"><span class="pre">LLM.save()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.tokenizer"><code class="docutils literal notranslate"><span class="pre">LLM.tokenizer</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LLM.workspace"><code class="docutils literal notranslate"><span class="pre">LLM.workspace</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for RequestOutput; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput"><code class="docutils literal notranslate"><span class="pre">RequestOutput</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput.__init__"><code class="docutils literal notranslate"><span class="pre">RequestOutput.__init__()</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for GuidedDecodingParams; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.__init__"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.grammar"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.grammar</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.json</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json_object"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.json_object</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.num_guides"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.num_guides</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams.regex"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams.regex</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for SamplingParams; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams"><code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.__init__"><code class="docutils literal notranslate"><span class="pre">SamplingParams.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.add_special_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.add_special_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.additional_model_outputs"><code class="docutils literal notranslate"><span class="pre">SamplingParams.additional_model_outputs</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.bad"><code class="docutils literal notranslate"><span class="pre">SamplingParams.bad</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.bad_token_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.bad_token_ids</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate"><code class="docutils literal notranslate"><span class="pre">SamplingParams.beam_search_diversity_rate</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.beam_width"><code class="docutils literal notranslate"><span class="pre">SamplingParams.beam_width</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.best_of"><code class="docutils literal notranslate"><span class="pre">SamplingParams.best_of</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.detokenize"><code class="docutils literal notranslate"><span class="pre">SamplingParams.detokenize</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.early_stopping"><code class="docutils literal notranslate"><span class="pre">SamplingParams.early_stopping</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.embedding_bias"><code class="docutils literal notranslate"><span class="pre">SamplingParams.embedding_bias</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.end_id"><code class="docutils literal notranslate"><span class="pre">SamplingParams.end_id</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.exclude_input_from_output</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.external_draft_tokens_config"><code class="docutils literal notranslate"><span class="pre">SamplingParams.external_draft_tokens_config</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.frequency_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.frequency_penalty</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.greedy_decoding"><code class="docutils literal notranslate"><span class="pre">SamplingParams.greedy_decoding</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.guided_decoding"><code class="docutils literal notranslate"><span class="pre">SamplingParams.guided_decoding</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.ignore_eos"><code class="docutils literal notranslate"><span class="pre">SamplingParams.ignore_eos</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.include_stop_str_in_output</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.length_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.length_penalty</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.logits_post_processor_name"><code class="docutils literal notranslate"><span class="pre">SamplingParams.logits_post_processor_name</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.lookahead_config"><code class="docutils literal notranslate"><span class="pre">SamplingParams.lookahead_config</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.max_new_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.max_new_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.max_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.max_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.min_length"><code class="docutils literal notranslate"><span class="pre">SamplingParams.min_length</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.min_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.min_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.n"><code class="docutils literal notranslate"><span class="pre">SamplingParams.n</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size"><code class="docutils literal notranslate"><span class="pre">SamplingParams.no_repeat_ngram_size</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.num_return_sequences"><code class="docutils literal notranslate"><span class="pre">SamplingParams.num_return_sequences</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.pad_id"><code class="docutils literal notranslate"><span class="pre">SamplingParams.pad_id</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.presence_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.presence_penalty</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.random_seed"><code class="docutils literal notranslate"><span class="pre">SamplingParams.random_seed</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.repetition_penalty"><code class="docutils literal notranslate"><span class="pre">SamplingParams.repetition_penalty</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.return_context_logits"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_context_logits</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.return_encoder_output"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_encoder_output</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.return_generation_logits"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_generation_logits</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.return_log_probs"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_log_probs</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.return_perf_metrics"><code class="docutils literal notranslate"><span class="pre">SamplingParams.return_perf_metrics</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.seed"><code class="docutils literal notranslate"><span class="pre">SamplingParams.seed</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.setup"><code class="docutils literal notranslate"><span class="pre">SamplingParams.setup()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.skip_special_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.skip_special_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.spaces_between_special_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.stop"><code class="docutils literal notranslate"><span class="pre">SamplingParams.stop</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.stop_token_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.stop_token_ids</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.temperature"><code class="docutils literal notranslate"><span class="pre">SamplingParams.temperature</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.top_k"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_k</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.top_p"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.top_p_decay"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_decay</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.top_p_min"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_min</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids"><code class="docutils literal notranslate"><span class="pre">SamplingParams.top_p_reset_ids</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens"><code class="docutils literal notranslate"><span class="pre">SamplingParams.truncate_prompt_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams.use_beam_search"><code class="docutils literal notranslate"><span class="pre">SamplingParams.use_beam_search</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for KvCacheConfig; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.__init__"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.cross_kv_cache_fraction</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.enable_block_reuse</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.event_buffer_max_size</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.fill_empty_fields_from_runtime_defaults"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.fill_empty_fields_from_runtime_defaults()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.free_gpu_memory_fraction</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.host_cache_size"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.host_cache_size</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.max_attention_window"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.max_attention_window</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.max_tokens"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.max_tokens</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.onboard_blocks</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.secondary_offload_min_priority</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig.sink_token_length"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig.sink_token_length</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for LookaheadDecodingConfig; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig.calculate_speculative_resource()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig.max_ngram_size</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig.max_verification_set_size</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig.max_window_size</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for MedusaDecodingConfig; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.__init__"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig.medusa_choices</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig.num_medusa_heads</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for SchedulerConfig; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig.__init__"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.capacity_scheduler_policy</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.context_chunking_policy</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig.dynamic_batch_config</span></code></a></li>
</ul>
</li>
|
||
<!-- Sidebar entry for CapacitySchedulerPolicy; every link targets an in-page anchor. -->
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy</span></code></a><ul>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.GUARANTEED_NO_EVICT</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.MAX_UTILIZATION</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.STATIC_BATCH</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.__init__"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.__init__()</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.name"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.name</span></code></a></li>
  <li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.value"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy.value</span></code></a></li>
</ul>
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig"><code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code></a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.__init__"><code class="docutils literal notranslate"><span class="pre">BuildConfig.__init__()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.auto_parallel_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.auto_parallel_config</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.dry_run"><code class="docutils literal notranslate"><span class="pre">BuildConfig.dry_run</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.enable_debug_output"><code class="docutils literal notranslate"><span class="pre">BuildConfig.enable_debug_output</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.force_num_profiles"><code class="docutils literal notranslate"><span class="pre">BuildConfig.force_num_profiles</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.from_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.from_json_file"><code class="docutils literal notranslate"><span class="pre">BuildConfig.from_json_file()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">BuildConfig.gather_context_logits</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">BuildConfig.gather_generation_logits</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.input_timing_cache"><code class="docutils literal notranslate"><span class="pre">BuildConfig.input_timing_cache</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.kv_cache_type"><code class="docutils literal notranslate"><span class="pre">BuildConfig.kv_cache_type</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.lora_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.lora_config</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_batch_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_batch_size</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_beam_width"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_beam_width</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_draft_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_draft_len</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_encoder_input_len</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_input_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_input_len</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_num_tokens"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_num_tokens</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_prompt_embedding_table_size</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.max_seq_len"><code class="docutils literal notranslate"><span class="pre">BuildConfig.max_seq_len</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.monitor_memory"><code class="docutils literal notranslate"><span class="pre">BuildConfig.monitor_memory</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.opt_batch_size"><code class="docutils literal notranslate"><span class="pre">BuildConfig.opt_batch_size</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.opt_num_tokens"><code class="docutils literal notranslate"><span class="pre">BuildConfig.opt_num_tokens</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.output_timing_cache"><code class="docutils literal notranslate"><span class="pre">BuildConfig.output_timing_cache</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.plugin_config"><code class="docutils literal notranslate"><span class="pre">BuildConfig.plugin_config</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.profiling_verbosity"><code class="docutils literal notranslate"><span class="pre">BuildConfig.profiling_verbosity</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode"><code class="docutils literal notranslate"><span class="pre">BuildConfig.speculative_decoding_mode</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.strongly_typed"><code class="docutils literal notranslate"><span class="pre">BuildConfig.strongly_typed</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.to_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.update"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.update_from_dict"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update_from_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type"><code class="docutils literal notranslate"><span class="pre">BuildConfig.update_kv_cache_type()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.use_fused_mlp"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_fused_mlp</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.use_mrope"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_mrope</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.use_refit"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_refit</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.use_strip_plan"><code class="docutils literal notranslate"><span class="pre">BuildConfig.use_strip_plan</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.visualize_network"><code class="docutils literal notranslate"><span class="pre">BuildConfig.visualize_network</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.weight_sparsity"><code class="docutils literal notranslate"><span class="pre">BuildConfig.weight_sparsity</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig.weight_streaming"><code class="docutils literal notranslate"><span class="pre">BuildConfig.weight_streaming</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig"><code class="docutils literal notranslate"><span class="pre">QuantConfig</span></code></a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.__init__"><code class="docutils literal notranslate"><span class="pre">QuantConfig.__init__()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.clamp_val"><code class="docutils literal notranslate"><span class="pre">QuantConfig.clamp_val</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.exclude_modules"><code class="docutils literal notranslate"><span class="pre">QuantConfig.exclude_modules</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">QuantConfig.from_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.get_modelopt_kv_cache_dtype"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_modelopt_kv_cache_dtype()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.get_modelopt_qformat"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_modelopt_qformat()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.get_quant_cfg"><code class="docutils literal notranslate"><span class="pre">QuantConfig.get_quant_cfg()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.group_size"><code class="docutils literal notranslate"><span class="pre">QuantConfig.group_size</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.has_zero_point"><code class="docutils literal notranslate"><span class="pre">QuantConfig.has_zero_point</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo"><code class="docutils literal notranslate"><span class="pre">QuantConfig.kv_cache_quant_algo</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.layer_quant_mode"><code class="docutils literal notranslate"><span class="pre">QuantConfig.layer_quant_mode</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.pre_quant_scale"><code class="docutils literal notranslate"><span class="pre">QuantConfig.pre_quant_scale</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.quant_algo"><code class="docutils literal notranslate"><span class="pre">QuantConfig.quant_algo</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.quant_mode"><code class="docutils literal notranslate"><span class="pre">QuantConfig.quant_mode</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.requires_calibration"><code class="docutils literal notranslate"><span class="pre">QuantConfig.requires_calibration</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.requires_modelopt_quantization"><code class="docutils literal notranslate"><span class="pre">QuantConfig.requires_modelopt_quantization</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.smoothquant_val"><code class="docutils literal notranslate"><span class="pre">QuantConfig.smoothquant_val</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">QuantConfig.to_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.use_meta_recipe"><code class="docutils literal notranslate"><span class="pre">QuantConfig.use_meta_recipe</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig.use_plugin_sq"><code class="docutils literal notranslate"><span class="pre">QuantConfig.use_plugin_sq</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo"><code class="docutils literal notranslate"><span class="pre">QuantAlgo</span></code></a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.FP8"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.FP8</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.INT8"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.INT8</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.MIXED_PRECISION</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.NO_QUANT"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.NO_QUANT</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.NVFP4"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.NVFP4</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16_AWQ</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A16_GPTQ</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A8_AWQ</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A8_QSERVE_PER_CHANNEL</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W4A8_QSERVE_PER_GROUP</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A16</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A16_GPTQ</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig"><code class="docutils literal notranslate"><span class="pre">CalibConfig</span></code></a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.__init__"><code class="docutils literal notranslate"><span class="pre">CalibConfig.__init__()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.calib_batch_size"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_batch_size</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.calib_batches"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_batches</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.calib_dataset"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_dataset</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length"><code class="docutils literal notranslate"><span class="pre">CalibConfig.calib_max_seq_length</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.device"><code class="docutils literal notranslate"><span class="pre">CalibConfig.device</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">CalibConfig.from_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.random_seed"><code class="docutils literal notranslate"><span class="pre">CalibConfig.random_seed</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">CalibConfig.to_dict()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length"><code class="docutils literal notranslate"><span class="pre">CalibConfig.tokenizer_max_seq_length</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig</span></code></a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig.cache_root"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.cache_root</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_records"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_records</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_cache_storage_gb</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig.__init__"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.__init__()</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#id0"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.cache_root</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#id1"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_cache_storage_gb</span></code></a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#id2"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig.max_records</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestError"><code class="docutils literal notranslate"><span class="pre">RequestError</span></code></a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.llmapi.NoStatsAvailable"><code class="docutils literal notranslate"><span class="pre">NoStatsAvailable</span></code></a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/index.html">LLM Examples Introduction</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/customization.html">Common Customizations</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/llm_api_examples.html">Examples</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-best-practices.html">Best Practices</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</nav>
|
||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||
<a href="../index.html">tensorrt_llm</a>
|
||
</nav>
|
||
|
||
<div class="wy-nav-content">
|
||
<div class="rst-content">
|
||
<div role="navigation" aria-label="Page navigation">
|
||
<ul class="wy-breadcrumbs">
|
||
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
||
<li class="breadcrumb-item active">API Reference</li>
|
||
<li class="wy-breadcrumbs-aside">
|
||
<a href="../_sources/llm-api/reference.rst.txt" rel="nofollow"> View page source</a>
|
||
</li>
|
||
</ul>
|
||
<hr/>
|
||
</div>
|
||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
||
<div itemprop="articleBody">
|
||
|
||
<section id="api-reference">
|
||
<h1>API Reference<a class="headerlink" href="#api-reference" title="Link to this heading"></a></h1>
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">LLM</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'auto'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'slow'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> 
</span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>The LLM class is the main class for running an LLM model.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>model</strong> (<em>str</em><em> or </em><em>Path</em>) – The model name or a local model directory.
|
||
Note that if the value could be either a model name or a local model directory,
|
||
the local model directory will be prioritized.</p></li>
|
||
<li><p><strong>tokenizer</strong> (<em>str</em><em>, </em><em>Path</em><em>, </em><em>TokenizerBase</em><em>, </em><em>PreTrainedTokenizerBase</em><em>, </em><em>optional</em>) – The name or path of a Hugging Face Transformers tokenizer, or the loaded tokenizer.
|
||
Defaults to None.</p></li>
|
||
<li><p><strong>tokenizer_mode</strong> (<em>Literal</em><em>[</em><em>'auto'</em><em>, </em><em>'slow'</em><em>]</em>) – The tokenizer mode.
|
||
‘auto’ will use the fast tokenizer if available, and ‘slow’ will always use the slow tokenizer.
|
||
The fast tokenizer is based on Hugging Face’s Rust library tokenizers, which achieves a significant speed-up compared to its slow counterpart.
|
||
Defaults to ‘auto’.</p></li>
|
||
<li><p><strong>skip_tokenizer_init</strong> (<em>bool</em>) – If true, skip initialization of tokenizer and detokenizer.
|
||
LLM.generate and LLM.generate_async will accept prompt token ids as input only.
|
||
Defaults to False.</p></li>
|
||
<li><p><strong>trust_remote_code</strong> (<em>bool</em>) – Whether to trust remote code when downloading model and tokenizer from Hugging Face. Defaults to False.</p></li>
|
||
<li><p><strong>tensor_parallel_size</strong> (<em>int</em>) – The number of processes for tensor parallelism. Defaults to 1.</p></li>
|
||
<li><p><strong>dtype</strong> (<em>str</em>) – The data type for the model weights and activations.
|
||
Can be “float16”, “bfloat16”, “float32”, or “auto”. If “auto”, the data type
|
||
will be automatically inferred from the source model. If the source data type
|
||
is “float32”, it will be converted to “float16”. Defaults to “auto”.</p></li>
|
||
<li><p><strong>revision</strong> (<em>str</em><em>, </em><em>optional</em>) – The revision of the model to use. Defaults to None.</p></li>
|
||
<li><p><strong>tokenizer_revision</strong> (<em>str</em><em>, </em><em>optional</em>) – The revision of the tokenizer to use. Defaults to None.</p></li>
|
||
<li><p><strong>pipeline_parallel_size</strong> (<em>int</em>) – The pipeline parallel size. Defaults to 1.</p></li>
|
||
<li><p><strong>context_parallel_size</strong> (<em>int</em>) – The context parallel size. Defaults to 1.</p></li>
|
||
<li><p><strong>load_format</strong> (<em>Literal</em><em>[</em><em>'auto'</em><em>, </em><em>'dummy'</em><em>]</em>) – The format of the model weights to load.
|
||
* ‘auto’ will try to load the weights from the provided checkpoint.
|
||
* ‘dummy’ will initialize the weights with random values, which is mainly for profiling.
|
||
Defaults to ‘auto’.</p></li>
|
||
<li><p><strong>enable_tqdm</strong> (<em>bool</em>) – Whether to display a progress bar during model building. Defaults to False.</p></li>
|
||
<li><p><strong>enable_lora</strong> (<em>bool</em>) – Enable LoRA adapters. Defaults to False.</p></li>
|
||
<li><p><strong>max_lora_rank</strong> (<em>int</em><em>, </em><em>optional</em>) – Maximum LoRA rank. If specified, it overrides <cite>build_config.lora_config.max_lora_rank</cite>. Defaults to None.</p></li>
|
||
<li><p><strong>max_loras</strong> (<em>int</em>) – Maximum number of LoRA adapters to be stored in GPU memory. Defaults to 4.</p></li>
|
||
<li><p><strong>max_cpu_loras</strong> (<em>int</em>) – Maximum number of LoRA adapters to be stored in CPU memory. Defaults to 4.</p></li>
|
||
<li><p><strong>enable_prompt_adapter</strong> (<em>bool</em>) – Enable prompt adapters. Defaults to False.</p></li>
|
||
<li><p><strong>max_prompt_adapter_token</strong> (<em>int</em>) – Maximum number of prompt adapter tokens. Defaults to 0.</p></li>
|
||
<li><p><strong>quant_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig" title="tensorrt_llm.llmapi.QuantConfig"><em>QuantConfig</em></a><em>, </em><em>optional</em>) – The quantization configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>calib_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.CalibConfig"><em>CalibConfig</em></a><em>, </em><em>optional</em>) – The calibration configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>build_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig" title="tensorrt_llm.llmapi.BuildConfig"><em>BuildConfig</em></a><em>, </em><em>optional</em>) – The build configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>kv_cache_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.llmapi.KvCacheConfig"><em>KvCacheConfig</em></a><em>, </em><em>optional</em>) – The key-value cache configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>enable_chunked_prefill</strong> (<em>bool</em>) – Whether to enable chunked prefill. Defaults to False.</p></li>
|
||
<li><p><strong>decoding_config</strong> (<em>DecodingConfig</em><em>, </em><em>optional</em>) – The decoding configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>guided_decoding_backend</strong> (<em>str</em><em>, </em><em>optional</em>) – The guided decoding backend, currently supports ‘xgrammar’. Defaults to None.</p></li>
|
||
<li><p><strong>logits_post_processor_map</strong> (<em>Dict</em><em>[</em><em>str</em><em>, </em><em>Callable</em><em>]</em><em>, </em><em>optional</em>) – A map of logit post-processing functions. Defaults to None.</p></li>
|
||
<li><p><strong>iter_stats_max_iterations</strong> (<em>int</em><em>, </em><em>optional</em>) – The maximum number of iterations for iteration statistics. Defaults to None.</p></li>
|
||
<li><p><strong>request_stats_max_iterations</strong> (<em>int</em><em>, </em><em>optional</em>) – The maximum number of iterations for request statistics. Defaults to None.</p></li>
|
||
<li><p><strong>workspace</strong> (<em>str</em><em>, </em><em>optional</em>) – The directory to store intermediate files. Defaults to None.</p></li>
|
||
<li><p><strong>embedding_parallel_mode</strong> (<em>str</em>) – The parallel mode for embeddings. Defaults to ‘SHARDING_ALONG_VOCAB’.</p></li>
|
||
<li><p><strong>auto_parallel</strong> (<em>bool</em>) – Enable auto parallel mode. Defaults to False.</p></li>
|
||
<li><p><strong>auto_parallel_world_size</strong> (<em>int</em>) – The MPI world size for auto parallel. Defaults to 1.</p></li>
|
||
<li><p><strong>moe_tensor_parallel_size</strong> (<em>int</em><em>, </em><em>optional</em>) – The tensor parallel size for MoE models’ expert weights.</p></li>
|
||
<li><p><strong>moe_expert_parallel_size</strong> (<em>int</em><em>, </em><em>optional</em>) – The expert parallel size for MoE models’ expert weights.</p></li>
|
||
<li><p><strong>fast_build</strong> (<em>bool</em>) – Enable features for faster engine building.
|
||
This may cause some performance degradation and is currently incompatible with int8/int4 quantization.
|
||
Defaults to False.</p></li>
|
||
<li><p><strong>enable_build_cache</strong> (<em>bool</em><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="tensorrt_llm.llmapi.BuildCacheConfig"><em>BuildCacheConfig</em></a><em>, </em><em>optional</em>) – Whether to enable build caching for the model. Defaults to None.</p></li>
|
||
<li><p><strong>peft_cache_config</strong> (<em>PeftCacheConfig</em><em>, </em><em>optional</em>) – The PEFT cache configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>scheduler_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig" title="tensorrt_llm.llmapi.SchedulerConfig"><em>SchedulerConfig</em></a><em>, </em><em>optional</em>) – The scheduler configuration for the model. Defaults to None.</p></li>
|
||
<li><p><strong>speculative_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.llmapi.LookaheadDecodingConfig"><em>LookaheadDecodingConfig</em></a><em> or </em><em>other speculative configurations</em><em>, </em><em>optional</em>) – The speculative decoding configuration. Defaults to None.</p></li>
|
||
<li><p><strong>batching_type</strong> (<em>BatchingType</em><em>, </em><em>optional</em>) – The batching type for the model. Defaults to None.</p></li>
|
||
<li><p><strong>normalize_log_probs</strong> (<em>bool</em>) – Whether to normalize log probabilities for the model. Defaults to False.</p></li>
|
||
<li><p><strong>max_batch_size</strong> (<em>int</em><em>, </em><em>optional</em>) – The maximum batch size for runtime. Defaults to None.</p></li>
|
||
<li><p><strong>max_num_tokens</strong> (<em>int</em><em>, </em><em>optional</em>) – The maximum number of tokens for runtime. Defaults to None.</p></li>
|
||
<li><p><strong>extended_runtime_perf_knob_config</strong> (<em>ExtendedRuntimePerfKnobConfig</em><em>, </em><em>optional</em>) – The extended runtime performance knob configuration for the model. Defaults to None.</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'auto'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'slow'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span 
class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.generate">
|
||
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span 
class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_tqdm</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">LoRARequest</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt_adapter_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span 
class="pre">PromptAdapterRequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">PromptAdapterRequest</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">queries</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span 
class="w"> </span><span class="pre">TokensPrompt</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.generate" title="Link to this definition"></a></dt>
|
||
<dd><p>Generate output for the given prompts in the synchronous mode.
|
||
Synchronous generation accepts either a single prompt or batched prompts.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>inputs</strong> (<em>PromptInputs</em><em> or </em><em>Sequence</em><em>[</em><em>PromptInputs</em><em>]</em>) – The prompt text or token ids.
|
||
It can be a single prompt or batched prompts.</p></li>
|
||
<li><p><strong>sampling_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.llmapi.SamplingParams"><em>SamplingParams</em></a><em>, </em><em>List</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.llmapi.SamplingParams"><em>SamplingParams</em></a><em>]</em><em>, </em><em>optional</em>) – The sampling params for the
|
||
generation, a default one will be used if not provided. Defaults to None.</p></li>
|
||
<li><p><strong>use_tqdm</strong> (<em>bool</em>) – Whether to use tqdm to display the progress bar. Defaults to True.</p></li>
|
||
<li><p><strong>lora_request</strong> (<em>LoRARequest</em><em>, </em><em>Sequence</em><em>[</em><em>LoRARequest</em><em>]</em><em>, </em><em>optional</em>) – LoRA request to use for generation,
|
||
if any. Defaults to None.</p></li>
|
||
<li><p><strong>prompt_adapter_request</strong> (<em>PromptAdapterRequest</em><em>, </em><em>Sequence</em><em>[</em><em>PromptAdapterRequest</em><em>]</em><em>, </em><em>optional</em>) – Prompt Adapter request to use for generation, if any. Defaults to None.</p></li>
|
||
<li><p><strong>queries</strong> (<em>PromptInputs</em><em> or </em><em>Sequence</em><em>[</em><em>PromptInputs</em><em>]</em>) – The query text or token ids.
|
||
It can be a single prompt or batched prompts. It is used for star attention to run long-context tasks.</p></li>
|
||
</ul>
|
||
</dd>
|
||
<dt class="field-even">Returns<span class="colon">:</span></dt>
|
||
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
|
||
</dd>
|
||
<dt class="field-odd">Return type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p>Union[<a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">RequestOutput</a>, List[<a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">RequestOutput</a>]]</p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.generate_async">
|
||
<span class="sig-name descname"><span class="pre">generate_async</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">prompt_adapter_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PromptAdapterRequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">queries</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span 
class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.generate_async" title="Link to this definition"></a></dt>
|
||
<dd><p>Generate output for the given prompt in the asynchronous mode.
|
||
Asynchronous generation accepts a single prompt only.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>inputs</strong> (<em>PromptInputs</em>) – The prompt text or token ids; it must be a single prompt.</p></li>
|
||
<li><p><strong>sampling_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.llmapi.SamplingParams"><em>SamplingParams</em></a><em>, </em><em>optional</em>) – The sampling params for the generation,
|
||
a default one will be used if not provided. Defaults to None.</p></li>
|
||
<li><p><strong>lora_request</strong> (<em>LoRARequest</em><em>, </em><em>optional</em>) – LoRA request to use for generation, if any.
|
||
Defaults to None.</p></li>
|
||
<li><p><strong>prompt_adapter_request</strong> (<em>PromptAdapterRequest</em><em>, </em><em>optional</em>) – Prompt Adapter request to
|
||
use for generation, if any. Defaults to None.</p></li>
|
||
<li><p><strong>streaming</strong> (<em>bool</em>) – Whether to use the streaming mode for the generation. Defaults to
|
||
False.</p></li>
|
||
<li><p><strong>queries</strong> (<em>PromptInputs</em><em> or </em><em>Sequence</em><em>[</em><em>PromptInputs</em><em>]</em>) – The query text or token ids.
|
||
It can be a single prompt or batched prompts. It is used for star attention to run long-context tasks.</p></li>
|
||
</ul>
|
||
</dd>
|
||
<dt class="field-even">Returns<span class="colon">:</span></dt>
|
||
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
|
||
</dd>
|
||
<dt class="field-odd">Return type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">RequestOutput</a></p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.save">
|
||
<span class="sig-name descname"><span class="pre">save</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.save"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.save" title="Link to this definition"></a></dt>
|
||
<dd><p>Save the built engine to the given path.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p><strong>engine_dir</strong> (<em>str</em>) – The path to save the engine.</p>
|
||
</dd>
|
||
<dt class="field-even">Returns<span class="colon">:</span></dt>
|
||
<dd class="field-even"><p>None</p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.tokenizer">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokenizer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.tokenizer" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.workspace">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">workspace</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.workspace" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">RequestOutput</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">generation_result</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">GenerationResult</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#RequestOutput"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DetokenizedGenerationResultBase</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">GenerationResult</span></code></p>
|
||
<p>The output data of a completion request to the LLM.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>request_id</strong> (<em>int</em>) – The unique ID of the request.</p></li>
|
||
<li><p><strong>prompt</strong> (<em>str</em><em>, </em><em>optional</em>) – The prompt string of the request.</p></li>
|
||
<li><p><strong>prompt_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) – The token ids of the prompt.</p></li>
|
||
<li><p><strong>outputs</strong> (<em>List</em><em>[</em><em>CompletionOutput</em><em>]</em>) – The output sequences of the request.</p></li>
|
||
<li><p><strong>context_logits</strong> (<em>torch.Tensor</em><em>, </em><em>optional</em>) – The logits on the prompt token ids.</p></li>
|
||
<li><p><strong>finished</strong> (<em>bool</em>) – Whether the whole request is finished.</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">generation_result</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">GenerationResult</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prompt</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#RequestOutput.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">GuidedDecodingParams</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">regex</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">grammar</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span 
class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">json_object</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/sampling_params.html#GuidedDecodingParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>Guided decoding parameters for text generation. Only one of the fields can be in effect at a time.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>json</strong> (<em>str</em><em>, </em><em>BaseModel</em><em>, </em><em>dict</em><em>, </em><em>optional</em>) – The generated text conforms to JSON format with additional user-specified restrictions, namely a schema. Defaults to None.</p></li>
|
||
<li><p><strong>regex</strong> (<em>str</em><em>, </em><em>optional</em>) – The generated text conforms to the user-specified regular expression. Defaults to None.</p></li>
|
||
<li><p><strong>grammar</strong> (<em>str</em><em>, </em><em>optional</em>) – The generated text conforms to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.</p></li>
|
||
<li><p><strong>json_object</strong> (<em>bool</em>) – If True, the generated text conforms to JSON format. Defaults to False.</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">regex</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">grammar</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">json_object</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.grammar">
|
||
<span class="sig-name descname"><span class="pre">grammar</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.grammar" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.json">
|
||
<span class="sig-name descname"><span class="pre">json</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.json_object">
|
||
<span class="sig-name descname"><span class="pre">json_object</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json_object" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.num_guides">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_guides</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.num_guides" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.regex">
|
||
<span class="sig-name descname"><span class="pre">regex</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.regex" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">SamplingParams</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span 
class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span 
class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">external_draft_tokens_config</span></span><span class="p"><span 
class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logits_post_processor_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">best_of</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_beam_search</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span 
class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_return_sequences</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span 
class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span 
class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_perf_metrics</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">additional_model_outputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.bindings.executor.LookaheadDecodingConfig"><span class="pre">LookaheadDecodingConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">guided_decoding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">ignore_eos</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">detokenize</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">truncate_prompt_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">spaces_between_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/sampling_params.html#SamplingParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>Sampling parameters for text generation.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>end_id</strong> (<em>int</em><em>, </em><em>optional</em>) – The end token id. Defaults to None.</p></li>
|
||
<li><p><strong>pad_id</strong> (<em>int</em><em>, </em><em>optional</em>) – The pad token id. Defaults to None.</p></li>
|
||
<li><p><strong>max_tokens</strong> (<em>int</em>) – The maximum number of tokens to generate. Defaults to 32.</p></li>
|
||
<li><p><strong>max_new_tokens</strong> (<em>int</em><em>, </em><em>optional</em>) – The maximum number of tokens to generate. This argument is being deprecated; please use max_tokens instead. Defaults to None.</p></li>
|
||
<li><p><strong>bad</strong> (<em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>, </em><em>optional</em>) – A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output. Defaults to None.</p></li>
|
||
<li><p><strong>bad_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) – A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output. Defaults to None.</p></li>
|
||
<li><p><strong>stop</strong> (<em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>, </em><em>optional</em>) – A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True. Defaults to None.</p></li>
|
||
<li><p><strong>stop_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) – A list of token ids that stop the generation when they are generated. Defaults to None.</p></li>
|
||
<li><p><strong>include_stop_str_in_output</strong> (<em>bool</em>) – Whether to include the stop strings in output text. Defaults to False.</p></li>
|
||
<li><p><strong>embedding_bias</strong> (<em>torch.Tensor</em><em>, </em><em>optional</em>) – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. Defaults to None.</p></li>
|
||
<li><p><strong>external_draft_tokens_config</strong> (<em>ExternalDraftTokensConfig</em><em>, </em><em>optional</em>) – The speculative decoding configuration. Defaults to None.</p></li>
|
||
<li><p><strong>logits_post_processor_name</strong> (<em>str</em><em>, </em><em>optional</em>) – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig. Defaults to None.</p></li>
|
||
<li><p><strong>n</strong> (<em>int</em>) – Number of sequences to generate. Defaults to 1.</p></li>
|
||
<li><p><strong>best_of</strong> (<em>int</em><em>, </em><em>optional</em>) – Number of sequences to consider for best output. Defaults to None.</p></li>
|
||
<li><p><strong>use_beam_search</strong> (<em>bool</em>) – Whether to use beam search. Defaults to False.</p></li>
|
||
<li><p><strong>beam_width</strong> (<em>int</em>) – The beam width. Setting 1 disables beam search. This parameter will be deprecated from the LLM API in a future release. Please use n/best_of/use_beam_search instead. Defaults to 1.</p></li>
|
||
<li><p><strong>num_return_sequences</strong> (<em>int</em><em>, </em><em>optional</em>) – The number of sequences to return. If set to None, it defaults to the value of <cite>beam_width</cite>. This parameter will be deprecated from the LLM API in a future release. Please use n/best_of/use_beam_search instead. Defaults to None.</p></li>
|
||
<li><p><strong>top_k</strong> (<em>int</em>) – Controls number of logits to sample from. Default is 0 (all logits).</p></li>
|
||
<li><p><strong>top_p</strong> (<em>float</em>) – Controls the top-P probability to sample from. Default is 0.f</p></li>
|
||
<li><p><strong>top_p_min</strong> (<em>float</em>) – Controls decay in the top-P algorithm. top_p_min is the lower bound. Default is 1.e-6.</p></li>
|
||
<li><p><strong>top_p_reset_ids</strong> (<em>int</em>) – Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.</p></li>
|
||
<li><p><strong>top_p_decay</strong> (<em>float</em>) – Controls decay in the top-P algorithm. The decay value. Default is 1.f</p></li>
|
||
<li><p><strong>seed</strong> (<em>int</em>) – Controls the random seed used by the random number generator in sampling.</p></li>
|
||
<li><p><strong>random_seed</strong> (<em>int</em>) – Controls the random seed used by the random number generator in sampling. This argument is being deprecated; please use seed instead.</p></li>
|
||
<li><p><strong>temperature</strong> (<em>float</em>) – Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f</p></li>
|
||
<li><p><strong>min_tokens</strong> (<em>int</em>) – Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1.</p></li>
|
||
<li><p><strong>min_length</strong> (<em>int</em>) – Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1. This argument is being deprecated; please use min_tokens instead.</p></li>
|
||
<li><p><strong>beam_search_diversity_rate</strong> (<em>float</em>) – Controls the diversity in beam search.</p></li>
|
||
<li><p><strong>repetition_penalty</strong> (<em>float</em>) – Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourage repetition, values &gt; 1.f discourage it. Default is 1.f</p></li>
|
||
<li><p><strong>presence_penalty</strong> (<em>float</em>) – Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any value. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</p></li>
|
||
<li><p><strong>frequency_penalty</strong> (<em>float</em>) – Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any value. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</p></li>
|
||
<li><p><strong>length_penalty</strong> (<em>float</em>) – Controls how to penalize longer sequences in beam search. Default is 0.f</p></li>
|
||
<li><p><strong>early_stopping</strong> (<em>int</em>) – Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token).</p></li>
|
||
<li><p><strong>no_repeat_ngram_size</strong> (<em>int</em>) – Controls the size of n-grams that must not repeat. Default is 1 &lt;&lt; 30.</p></li>
|
||
<li><p><strong>return_log_probs</strong> (<em>bool</em>) – Controls if Result should contain log probabilities. Default is false.</p></li>
|
||
<li><p><strong>return_context_logits</strong> (<em>bool</em>) – Controls if Result should contain the context logits. Default is false.</p></li>
|
||
<li><p><strong>return_generation_logits</strong> (<em>bool</em>) – Controls if Result should contain the generation logits. Default is false.</p></li>
|
||
<li><p><strong>exclude_input_from_output</strong> (<em>bool</em>) – Controls if the input tokens should be excluded from the output tokens in Result. Default is true.</p></li>
|
||
<li><p><strong>return_encoder_output</strong> (<em>bool</em>) – Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.</p></li>
|
||
<li><p><strong>return_perf_metrics</strong> (<em>bool</em>) – Controls if Result should contain the performance metrics for this request. Default is false.</p></li>
|
||
<li><p><strong>additional_model_outputs</strong> (<em>list</em><em>[</em><em>AdditionalModelOutput</em><em>]</em><em>, </em><em>optional</em>) – The additional outputs to gather from the model.</p></li>
|
||
<li><p><strong>lookahead_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.llmapi.LookaheadDecodingConfig"><em>LookaheadDecodingConfig</em></a><em>, </em><em>optional</em>) – Lookahead decoding config. Defaults to None.</p></li>
|
||
<li><p><strong>guided_decoding</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.llmapi.GuidedDecodingParams"><em>GuidedDecodingParams</em></a><em>, </em><em>optional</em>) – Guided decoding params. Defaults to None.</p></li>
|
||
<li><p><strong>ignore_eos</strong> (<em>bool</em>) – Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. Defaults to False.</p></li>
|
||
<li><p><strong>detokenize</strong> (<em>bool</em>) – Whether to detokenize the output. Defaults to True.</p></li>
|
||
<li><p><strong>add_special_tokens</strong> (<em>bool</em>) – Whether to add special tokens to the prompt. Defaults to True.</p></li>
|
||
<li><p><strong>truncate_prompt_tokens</strong> (<em>int</em><em>, </em><em>optional</em>) – If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None.</p></li>
|
||
<li><p><strong>skip_special_tokens</strong> (<em>bool</em>) – Whether to skip special tokens in the output. Defaults to True.</p></li>
|
||
<li><p><strong>spaces_between_special_tokens</strong> (<em>bool</em>) – Whether to add spaces between special tokens in the output. Defaults to True.</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_new_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span 
class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">external_draft_tokens_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span 
class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logits_post_processor_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">best_of</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_beam_search</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span 
class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">beam_width</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_return_sequences</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_perf_metrics</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">additional_model_outputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> 
</span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.bindings.executor.LookaheadDecodingConfig"><span class="pre">LookaheadDecodingConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">guided_decoding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ignore_eos</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span 
class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">detokenize</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">truncate_prompt_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">spaces_between_special_tokens</span></span><span class="p"><span 
class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.add_special_tokens">
|
||
<span class="sig-name descname"><span class="pre">add_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.add_special_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.additional_model_outputs">
|
||
<span class="sig-name descname"><span class="pre">additional_model_outputs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.additional_model_outputs" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.bad">
|
||
<span class="sig-name descname"><span class="pre">bad</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.bad" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.bad_token_ids">
|
||
<span class="sig-name descname"><span class="pre">bad_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.bad_token_ids" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate">
|
||
<span class="sig-name descname"><span class="pre">beam_search_diversity_rate</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.beam_width">
|
||
<span class="sig-name descname"><span class="pre">beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.beam_width" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.best_of">
|
||
<span class="sig-name descname"><span class="pre">best_of</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.best_of" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.detokenize">
|
||
<span class="sig-name descname"><span class="pre">detokenize</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.detokenize" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.early_stopping">
|
||
<span class="sig-name descname"><span class="pre">early_stopping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.early_stopping" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.embedding_bias">
|
||
<span class="sig-name descname"><span class="pre">embedding_bias</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.embedding_bias" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.end_id">
|
||
<span class="sig-name descname"><span class="pre">end_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.end_id" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output">
|
||
<span class="sig-name descname"><span class="pre">exclude_input_from_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.external_draft_tokens_config">
|
||
<span class="sig-name descname"><span class="pre">external_draft_tokens_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ExternalDraftTokensConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.external_draft_tokens_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.frequency_penalty">
|
||
<span class="sig-name descname"><span class="pre">frequency_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.frequency_penalty" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.greedy_decoding">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">greedy_decoding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.greedy_decoding" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.guided_decoding">
|
||
<span class="sig-name descname"><span class="pre">guided_decoding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.guided_decoding" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.ignore_eos">
|
||
<span class="sig-name descname"><span class="pre">ignore_eos</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.ignore_eos" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output">
|
||
<span class="sig-name descname"><span class="pre">include_stop_str_in_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.length_penalty">
|
||
<span class="sig-name descname"><span class="pre">length_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.length_penalty" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.logits_post_processor_name">
|
||
<span class="sig-name descname"><span class="pre">logits_post_processor_name</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.logits_post_processor_name" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.lookahead_config">
|
||
<span class="sig-name descname"><span class="pre">lookahead_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.bindings.executor.LookaheadDecodingConfig"><span class="pre">LookaheadDecodingConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.lookahead_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.max_new_tokens">
|
||
<span class="sig-name descname"><span class="pre">max_new_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.max_new_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.max_tokens">
|
||
<span class="sig-name descname"><span class="pre">max_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.max_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.min_length">
|
||
<span class="sig-name descname"><span class="pre">min_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.min_length" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.min_tokens">
|
||
<span class="sig-name descname"><span class="pre">min_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.min_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.n">
|
||
<span class="sig-name descname"><span class="pre">n</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.n" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size">
|
||
<span class="sig-name descname"><span class="pre">no_repeat_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.num_return_sequences">
|
||
<span class="sig-name descname"><span class="pre">num_return_sequences</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.num_return_sequences" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.pad_id">
|
||
<span class="sig-name descname"><span class="pre">pad_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.pad_id" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.presence_penalty">
|
||
<span class="sig-name descname"><span class="pre">presence_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.presence_penalty" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.random_seed">
|
||
<span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.random_seed" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.repetition_penalty">
|
||
<span class="sig-name descname"><span class="pre">repetition_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.repetition_penalty" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_context_logits">
|
||
<span class="sig-name descname"><span class="pre">return_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_context_logits" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_encoder_output">
|
||
<span class="sig-name descname"><span class="pre">return_encoder_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_encoder_output" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_generation_logits">
|
||
<span class="sig-name descname"><span class="pre">return_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_generation_logits" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_log_probs">
|
||
<span class="sig-name descname"><span class="pre">return_log_probs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_log_probs" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_perf_metrics">
|
||
<span class="sig-name descname"><span class="pre">return_perf_metrics</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_perf_metrics" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.seed">
|
||
<span class="sig-name descname"><span class="pre">seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.seed" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.setup">
|
||
<span class="sig-name descname"><span class="pre">setup</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/sampling_params.html#SamplingParams.setup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.setup" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.skip_special_tokens">
|
||
<span class="sig-name descname"><span class="pre">skip_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.skip_special_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens">
|
||
<span class="sig-name descname"><span class="pre">spaces_between_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.stop">
|
||
<span class="sig-name descname"><span class="pre">stop</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.stop" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.stop_token_ids">
|
||
<span class="sig-name descname"><span class="pre">stop_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.stop_token_ids" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.temperature">
|
||
<span class="sig-name descname"><span class="pre">temperature</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.temperature" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_k">
|
||
<span class="sig-name descname"><span class="pre">top_k</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_k" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p">
|
||
<span class="sig-name descname"><span class="pre">top_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_decay">
|
||
<span class="sig-name descname"><span class="pre">top_p_decay</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_decay" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_min">
|
||
<span class="sig-name descname"><span class="pre">top_p_min</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_min" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids">
|
||
<span class="sig-name descname"><span class="pre">top_p_reset_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens">
|
||
<span class="sig-name descname"><span class="pre">truncate_prompt_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.use_beam_search">
|
||
<span class="sig-name descname"><span class="pre">use_beam_search</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.use_beam_search" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">KvCacheConfig</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.bindings.executor.KvCacheConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheConfig</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_block_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_attention_window</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">free_gpu_memory_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">host_cache_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">onboard_blocks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cross_kv_cache_fraction</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">secondary_offload_min_priority</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">event_buffer_max_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">runtime_defaults</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">tensorrt_llm.bindings.executor.RuntimeDefaults</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span 
class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cross_kv_cache_fraction</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_block_reuse</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">event_buffer_max_size</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.fill_empty_fields_from_runtime_defaults">
|
||
<span class="sig-name descname"><span class="pre">fill_empty_fields_from_runtime_defaults</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.bindings.executor.KvCacheConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheConfig</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">tensorrt_llm.bindings.executor.RuntimeDefaults</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.fill_empty_fields_from_runtime_defaults" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">free_gpu_memory_fraction</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.host_cache_size">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">host_cache_size</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.host_cache_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.max_attention_window">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_attention_window</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.max_attention_window" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.max_tokens">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_tokens</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.max_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">onboard_blocks</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">secondary_offload_min_priority</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.sink_token_length">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">sink_token_length</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.sink_token_length" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">LookaheadDecodingConfig</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.bindings.executor.LookaheadDecodingConfig"><span class="pre">tensorrt_llm.bindings.executor.LookaheadDecodingConfig</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_verification_set_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource">
|
||
<span class="sig-name descname"><span class="pre">calculate_speculative_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.bindings.executor.LookaheadDecodingConfig"><span class="pre">tensorrt_llm.bindings.executor.LookaheadDecodingConfig</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_ngram_size</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_verification_set_size</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_window_size</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">MedusaDecodingConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_medusa_heads</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_utils.html#MedusaDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_medusa_heads</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices">
|
||
<span class="sig-name descname"><span class="pre">medusa_choices</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads">
|
||
<span class="sig-name descname"><span class="pre">num_medusa_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">SchedulerConfig</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig" title="tensorrt_llm.bindings.executor.SchedulerConfig"><span class="pre">tensorrt_llm.bindings.executor.SchedulerConfig</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">capacity_scheduler_policy</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="tensorrt_llm.bindings.executor.CapacitySchedulerPolicy"><span class="pre">tensorrt_llm.bindings.executor.CapacitySchedulerPolicy</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">CapacitySchedulerPolicy.GUARANTEED_NO_EVICT</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">context_chunking_policy</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">tensorrt_llm.bindings.executor.ContextChunkingPolicy</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dynamic_batch_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">tensorrt_llm.bindings.executor.DynamicBatchConfig</span><span class="w"> </span><span class="p"><span 
class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">capacity_scheduler_policy</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context_chunking_policy</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dynamic_batch_config</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CapacitySchedulerPolicy</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
|
||
<p>Members:</p>
|
||
<p>MAX_UTILIZATION</p>
|
||
<p>GUARANTEED_NO_EVICT</p>
|
||
<p>STATIC_BATCH</p>
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT">
|
||
<span class="sig-name descname"><span class="pre">GUARANTEED_NO_EVICT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.GUARANTEED_NO_EVICT:</span> <span class="pre">1&gt;</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION">
|
||
<span class="sig-name descname"><span class="pre">MAX_UTILIZATION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.MAX_UTILIZATION:</span> <span class="pre">0&gt;</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH">
|
||
<span class="sig-name descname"><span class="pre">STATIC_BATCH</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">&lt;CapacitySchedulerPolicy.STATIC_BATCH:</span> <span class="pre">2&gt;</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="tensorrt_llm.bindings.executor.CapacitySchedulerPolicy"><span class="pre">tensorrt_llm.bindings.executor.CapacitySchedulerPolicy</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">value</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.name">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">name</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.name" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.value">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">value</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.value" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">BuildConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">2048</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8192</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> 
<span class="pre">str</span> <span class="pre">=</span> <span class="pre">'model.cache'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_fused_mlp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">str</span> <span class="pre">=</span> <span 
class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">monitor_memory:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_mrope:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">2048</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8192</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">~tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">~tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span 
class="pre">'model.cache'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">~tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">~tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">~tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_fused_mlp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>, <em 
class="sig-param"><span class="n"><span class="pre">monitor_memory:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_mrope:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.auto_parallel_config">
|
||
<span class="sig-name descname"><span class="pre">auto_parallel_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">AutoParallelConfig</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.auto_parallel_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.dry_run">
|
||
<span class="sig-name descname"><span class="pre">dry_run</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.dry_run" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.enable_debug_output">
|
||
<span class="sig-name descname"><span class="pre">enable_debug_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.enable_debug_output" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.force_num_profiles">
|
||
<span class="sig-name descname"><span class="pre">force_num_profiles</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.force_num_profiles" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.from_dict">
|
||
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.from_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.from_json_file">
|
||
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_json_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_json_file"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.from_json_file" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.gather_context_logits">
|
||
<span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.gather_context_logits" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.gather_generation_logits">
|
||
<span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.gather_generation_logits" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.input_timing_cache">
|
||
<span class="sig-name descname"><span class="pre">input_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.input_timing_cache" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.kv_cache_type">
|
||
<span class="sig-name descname"><span class="pre">kv_cache_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">KVCacheType</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.kv_cache_type" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.lora_config">
|
||
<span class="sig-name descname"><span class="pre">lora_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LoraConfig</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.lora_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_batch_size">
|
||
<span class="sig-name descname"><span class="pre">max_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">2048</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_batch_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_beam_width">
|
||
<span class="sig-name descname"><span class="pre">max_beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_beam_width" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_draft_len">
|
||
<span class="sig-name descname"><span class="pre">max_draft_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_draft_len" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len">
|
||
<span class="sig-name descname"><span class="pre">max_encoder_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1024</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_input_len">
|
||
<span class="sig-name descname"><span class="pre">max_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1024</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_input_len" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_num_tokens">
|
||
<span class="sig-name descname"><span class="pre">max_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8192</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_num_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size">
|
||
<span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_seq_len">
|
||
<span class="sig-name descname"><span class="pre">max_seq_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_seq_len" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.monitor_memory">
|
||
<span class="sig-name descname"><span class="pre">monitor_memory</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.monitor_memory" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.opt_batch_size">
|
||
<span class="sig-name descname"><span class="pre">opt_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.opt_batch_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.opt_num_tokens">
|
||
<span class="sig-name descname"><span class="pre">opt_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.opt_num_tokens" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.output_timing_cache">
|
||
<span class="sig-name descname"><span class="pre">output_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'model.cache'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.output_timing_cache" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.plugin_config">
|
||
<span class="sig-name descname"><span class="pre">plugin_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html#tensorrt_llm.plugin.PluginConfig" title="tensorrt_llm.plugin.plugin.PluginConfig"><span class="pre">PluginConfig</span></a></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.plugin_config" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.profiling_verbosity">
|
||
<span class="sig-name descname"><span class="pre">profiling_verbosity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'layer_names_only'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.profiling_verbosity" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode">
|
||
<span class="sig-name descname"><span class="pre">speculative_decoding_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode" title="tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode"><span class="pre">SpeculativeDecodingMode</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.strongly_typed">
|
||
<span class="sig-name descname"><span class="pre">strongly_typed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.strongly_typed" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.to_dict">
|
||
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.to_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update">
|
||
<span class="sig-name descname"><span class="pre">update</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update_from_dict">
|
||
<span class="sig-name descname"><span class="pre">update_from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update_from_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type">
|
||
<span class="sig-name descname"><span class="pre">update_kv_cache_type</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model_architecture</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_kv_cache_type"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_fused_mlp">
|
||
<span class="sig-name descname"><span class="pre">use_fused_mlp</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_fused_mlp" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_mrope">
|
||
<span class="sig-name descname"><span class="pre">use_mrope</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_mrope" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_refit">
|
||
<span class="sig-name descname"><span class="pre">use_refit</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_refit" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_strip_plan">
|
||
<span class="sig-name descname"><span class="pre">use_strip_plan</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_strip_plan" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.visualize_network">
|
||
<span class="sig-name descname"><span class="pre">visualize_network</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.visualize_network" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.weight_sparsity">
|
||
<span class="sig-name descname"><span class="pre">weight_sparsity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.weight_sparsity" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.weight_streaming">
|
||
<span class="sig-name descname"><span class="pre">weight_streaming</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.weight_streaming" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">QuantConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span 
class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_meta_recipe</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span 
class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>Serializable quantization configuration class, part of the PretrainedConfig.</p>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_meta_recipe</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span 
class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.clamp_val">
|
||
<span class="sig-name descname"><span class="pre">clamp_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.clamp_val" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.exclude_modules">
|
||
<span class="sig-name descname"><span class="pre">exclude_modules</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.exclude_modules" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.from_dict">
|
||
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.from_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.get_modelopt_kv_cache_dtype">
|
||
<span class="sig-name descname"><span class="pre">get_modelopt_kv_cache_dtype</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_modelopt_kv_cache_dtype"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.get_modelopt_kv_cache_dtype" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.get_modelopt_qformat">
|
||
<span class="sig-name descname"><span class="pre">get_modelopt_qformat</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_modelopt_qformat"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.get_modelopt_qformat" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.get_quant_cfg">
|
||
<span class="sig-name descname"><span class="pre">get_quant_cfg</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">module_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.get_quant_cfg"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.get_quant_cfg" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.group_size">
|
||
<span class="sig-name descname"><span class="pre">group_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">128</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.group_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.has_zero_point">
|
||
<span class="sig-name descname"><span class="pre">has_zero_point</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.has_zero_point" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo">
|
||
<span class="sig-name descname"><span class="pre">kv_cache_quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.layer_quant_mode">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">layer_quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantMode" title="tensorrt_llm.quantization.mode.QuantMode"><span class="pre">QuantMode</span></a></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.layer_quant_mode" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.pre_quant_scale">
|
||
<span class="sig-name descname"><span class="pre">pre_quant_scale</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.pre_quant_scale" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.quant_algo">
|
||
<span class="sig-name descname"><span class="pre">quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.quant_algo" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.quant_mode">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">QuantModeWrapper</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.quant_mode" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.requires_calibration">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">requires_calibration</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.requires_calibration" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.requires_modelopt_quantization">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">requires_modelopt_quantization</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.requires_modelopt_quantization" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.smoothquant_val">
|
||
<span class="sig-name descname"><span class="pre">smoothquant_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.5</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.smoothquant_val" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.to_dict">
|
||
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.to_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.use_meta_recipe">
|
||
<span class="sig-name descname"><span class="pre">use_meta_recipe</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.use_meta_recipe" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.use_plugin_sq">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_plugin_sq</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.use_plugin_sq" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">QuantAlgo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">names=&lt;not</span> <span class="pre">given&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">*values</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">module=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">qualname=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">type=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">start=1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">boundary=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/quantization/mode.html#QuantAlgo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.FP8">
|
||
<span class="sig-name descname"><span class="pre">FP8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.FP8" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN">
|
||
<span class="sig-name descname"><span class="pre">FP8_PER_CHANNEL_PER_TOKEN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8_PER_CHANNEL_PER_TOKEN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.INT8">
|
||
<span class="sig-name descname"><span class="pre">INT8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'INT8'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.INT8" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION">
|
||
<span class="sig-name descname"><span class="pre">MIXED_PRECISION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'MIXED_PRECISION'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.NO_QUANT">
|
||
<span class="sig-name descname"><span class="pre">NO_QUANT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NO_QUANT'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.NO_QUANT" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.NVFP4">
|
||
<span class="sig-name descname"><span class="pre">NVFP4</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NVFP4'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.NVFP4" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16">
|
||
<span class="sig-name descname"><span class="pre">W4A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ">
|
||
<span class="sig-name descname"><span class="pre">W4A16_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ">
|
||
<span class="sig-name descname"><span class="pre">W4A16_GPTQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_GPTQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ">
|
||
<span class="sig-name descname"><span class="pre">W4A8_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL">
|
||
<span class="sig-name descname"><span class="pre">W4A8_QSERVE_PER_CHANNEL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_QSERVE_PER_CHANNEL'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP">
|
||
<span class="sig-name descname"><span class="pre">W4A8_QSERVE_PER_GROUP</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_QSERVE_PER_GROUP'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A16">
|
||
<span class="sig-name descname"><span class="pre">W8A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A16'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ">
|
||
<span class="sig-name descname"><span class="pre">W8A16_GPTQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A16_GPTQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL">
|
||
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN">
|
||
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN">
|
||
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN">
|
||
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN">
|
||
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CalibConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cuda'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cnn_dailymail'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batches</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> 
</span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1234</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_utils.html#CalibConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>Calibration configuration.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><ul class="simple">
|
||
<li><p><strong>device</strong> (<em>Literal</em><em>[</em><em>'cuda'</em><em>, </em><em>'cpu'</em><em>]</em><em>, </em><em>default='cuda'</em>) – The device on which to run calibration.</p></li>
|
||
<li><p><strong>calib_dataset</strong> (<em>str</em><em>, </em><em>default='cnn_dailymail'</em>) – The name or local path of the calibration dataset.</p></li>
|
||
<li><p><strong>calib_batches</strong> (<em>int</em><em>, </em><em>default=512</em>) – The number of batches that the calibration runs.</p></li>
|
||
<li><p><strong>calib_batch_size</strong> (<em>int</em><em>, </em><em>default=1</em>) – The batch size used during calibration.</p></li>
|
||
<li><p><strong>calib_max_seq_length</strong> (<em>int</em><em>, </em><em>default=512</em>) – The maximum sequence length that the calibration runs.</p></li>
|
||
<li><p><strong>random_seed</strong> (<em>int</em><em>, </em><em>default=1234</em>) – The random seed used for calibration.</p></li>
|
||
<li><p><strong>tokenizer_max_seq_length</strong> (<em>int</em><em>, </em><em>default=2048</em>) – The maximum sequence length used to initialize the tokenizer for calibration.</p></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cuda'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cnn_dailymail'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batches</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">calib_max_seq_length</span></span><span 
class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1234</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tokenizer_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_batch_size">
|
||
<span class="sig-name descname"><span class="pre">calib_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_batch_size" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_batches">
|
||
<span class="sig-name descname"><span class="pre">calib_batches</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_batches" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_dataset">
|
||
<span class="sig-name descname"><span class="pre">calib_dataset</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_dataset" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length">
|
||
<span class="sig-name descname"><span class="pre">calib_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.device">
|
||
<span class="sig-name descname"><span class="pre">device</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.device" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.from_dict">
|
||
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_utils.html#CalibConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.from_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.random_seed">
|
||
<span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.random_seed" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.to_dict">
|
||
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_utils.html#CalibConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.to_dict" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length">
|
||
<span class="sig-name descname"><span class="pre">tokenizer_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">BuildCacheConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/build_cache.html#BuildCacheConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||
<p>Configuration for the build cache.</p>
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.cache_root">
|
||
<span class="sig-name descname"><span class="pre">cache_root</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.cache_root" title="Link to this definition"></a></dt>
|
||
<dd><p>The root directory for the build cache.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p>Path</p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.max_records">
|
||
<span class="sig-name descname"><span class="pre">max_records</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_records" title="Link to this definition"></a></dt>
|
||
<dd><p>The maximum number of records to store in the cache.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p>int</p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<dl class="py attribute">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb">
|
||
<span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb" title="Link to this definition"></a></dt>
|
||
<dd><p>The maximum amount of storage (in GB) to use for the cache.</p>
|
||
<dl class="field-list simple">
|
||
<dt class="field-odd">Type<span class="colon">:</span></dt>
|
||
<dd class="field-odd"><p>float</p>
|
||
</dd>
|
||
</dl>
|
||
</dd></dl>
|
||
|
||
<div class="admonition note">
|
||
<p class="admonition-title">Note</p>
|
||
<p>The build-cache assumes the weights of the model are not changed during the execution. If the weights are
|
||
changed, you should remove the caches manually.</p>
|
||
</div>
|
||
<dl class="py method">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.__init__">
|
||
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/build_cache.html#BuildCacheConfig.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.__init__" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="id0">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cache_root</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#id0" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="id1">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><a class="headerlink" href="#id1" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
<dl class="py property">
|
||
<dt class="sig sig-object py" id="id2">
|
||
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_records</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#id2" title="Link to this definition"></a></dt>
|
||
<dd></dd></dl>
|
||
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestError">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">RequestError</span></span><a class="reference internal" href="../_modules/tensorrt_llm/executor.html#RequestError"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestError" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">RuntimeError</span></code></p>
|
||
<p>The error raised when the request fails.</p>
|
||
</dd></dl>
|
||
|
||
<dl class="py class">
|
||
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NoStatsAvailable">
|
||
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">NoStatsAvailable</span></span><a class="reference internal" href="../_modules/tensorrt_llm/executor.html#NoStatsAvailable"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.NoStatsAvailable" title="Link to this definition"></a></dt>
|
||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">Exception</span></code></p>
|
||
</dd></dl>
|
||
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||
<a href="index.html" class="btn btn-neutral float-left" title="API Introduction" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||
<a href="../llm-api-examples/index.html" class="btn btn-neutral float-right" title="LLM Examples Introduction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||
</div>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
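<!-- NOTE(review): an unrendered Jinja2 BlockReference repr was emitted here (likely a template block referenced without being called, e.g. "super" instead of "super()"). It was removed because it parses as an unknown, unclosed HTML element. Fix the theme's footer template and regenerate. -->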
|
||
|
||
<div class="footer">
|
||
<p>
|
||
Copyright © 2024 NVIDIA Corporation
|
||
</p>
|
||
<p>
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Privacy Policy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Manage My Privacy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Accessibility</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Product Security</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Contact</a>
|
||
</p>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |