mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
662 lines
81 KiB
HTML
662 lines
81 KiB
HTML
|
|
|
|
<!DOCTYPE html>
|
|
<html class="writer-html5" lang="en" data-content_root="../../">
|
|
<head>
|
|
<meta charset="utf-8" />
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
<title>tensorrt_llm.sampling_params — tensorrt_llm documentation</title>
|
|
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=80d5e7a1" />
|
|
<link rel="stylesheet" type="text/css" href="../../_static/css/theme.css?v=e59714d7" />
|
|
<link rel="stylesheet" type="text/css" href="../../_static/copybutton.css?v=76b2166b" />
|
|
|
|
|
|
<script src="../../_static/jquery.js?v=5d32c60e"></script>
|
|
<script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
|
<script src="../../_static/documentation_options.js?v=5929fcd5"></script>
|
|
<script src="../../_static/doctools.js?v=9a2dae69"></script>
|
|
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
|
|
<script src="../../_static/clipboard.min.js?v=a7894cd8"></script>
|
|
<script src="../../_static/copybutton.js?v=65e89d2a"></script>
|
|
<script src="../../_static/js/theme.js"></script>
|
|
<link rel="index" title="Index" href="../../genindex.html" />
|
|
<link rel="search" title="Search" href="../../search.html" />
|
|
</head>
|
|
|
|
<body class="wy-body-for-nav">
|
|
<div class="wy-grid-for-nav">
|
|
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
|
<div class="wy-side-scroll">
|
|
<div class="wy-side-nav-search" >
|
|
|
|
|
|
|
|
<a href="../../index.html" class="icon icon-home">
|
|
tensorrt_llm
|
|
</a>
|
|
<div role="search">
|
|
<form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
|
|
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
|
<input type="hidden" name="check_keywords" value="yes" />
|
|
<input type="hidden" name="area" value="default" />
|
|
</form>
|
|
</div>
|
|
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
|
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../overview.html">Overview</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../quick-start-guide.html">Quick Start Guide</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../key-features.html">Key Features</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../release-notes.html">Release Notes</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../installation/linux.html">Installing on Linux</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../installation/windows.html">Installing on Windows</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../llm-api/index.html">API Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../llm-api/reference.html">API Reference</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../llm-api-examples/index.html">LLM Examples Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../llm-api-examples/customization.html">Common Customizations</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../llm-api-examples/llm_api_examples.html">Examples</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.layers.html">Layers</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.functional.html">Functionals</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.models.html">Models</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/executor.html">Executor</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/runtime.html">Runtime</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html">Model Definition</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html#compilation">Compilation</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html#runtime">Runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../architecture/add-model.html">Adding a Model</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/executor.html">Executor API</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/inference-request.html">Inference Request</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/inference-request.html#responses">Responses</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-overview.html">Overview</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-benchmarking.html">Benchmarking</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-best-practices.html">Best Practices</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-analysis.html">Performance Analysis</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../reference/troubleshooting.html">Troubleshooting</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../reference/support-matrix.html">Support Matrix</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../reference/precision.html">Numerical Precision</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
|
</ul>
|
|
|
|
</div>
|
|
</div>
|
|
</nav>
|
|
|
|
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
|
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
|
<a href="../../index.html">tensorrt_llm</a>
|
|
</nav>
|
|
|
|
<div class="wy-nav-content">
|
|
<div class="rst-content">
|
|
<div role="navigation" aria-label="Page navigation">
|
|
<ul class="wy-breadcrumbs">
|
|
<li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
|
<li class="breadcrumb-item"><a href="../index.html">Module code</a></li>
|
|
<li class="breadcrumb-item active">tensorrt_llm.sampling_params</li>
|
|
<li class="wy-breadcrumbs-aside">
|
|
</li>
|
|
</ul>
|
|
<hr/>
|
|
</div>
|
|
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
|
<div itemprop="articleBody">
|
|
|
|
<h1>Source code for tensorrt_llm.sampling_params</h1><div class="highlight"><pre>
|
|
<span></span><span class="kn">import</span> <span class="nn">json</span>
|
|
<span class="kn">import</span> <span class="nn">os</span>
|
|
<span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">fields</span>
|
|
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span>
|
|
|
|
<span class="kn">import</span> <span class="nn">torch</span>
|
|
<span class="kn">from</span> <span class="nn">pydantic</span> <span class="kn">import</span> <span class="n">BaseModel</span>
|
|
|
|
<span class="kn">from</span> <span class="nn">tensorrt_llm.bindings</span> <span class="kn">import</span> <span class="n">executor</span> <span class="k">as</span> <span class="n">tllme</span>
|
|
<span class="kn">from</span> <span class="nn">tensorrt_llm.logger</span> <span class="kn">import</span> <span class="n">logger</span>
|
|
|
|
|
|
<div class="viewcode-block" id="GuidedDecodingParams">
|
|
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.GuidedDecodingParams">[docs]</a>
|
|
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
|
<span class="k">class</span> <span class="nc">GuidedDecodingParams</span><span class="p">:</span>
|
|
<span class="w"> </span><span class="sd">"""</span>
|
|
<span class="sd"> Guided decoding parameters for text generation. Only one of the fields can be in effect at a time.</span>
|
|
|
|
<span class="sd"> Args:</span>
|
|
<span class="sd"> json (str, BaseModel, dict, optional): The generated text is amenable to json format with additional user-specified restrictions, namely schema. Defaults to None.</span>
|
|
<span class="sd"> regex (str, optional): The generated text is amenable to the user-specified regular expression. Defaults to None.</span>
|
|
<span class="sd"> grammar (str, optional): The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.</span>
|
|
<span class="sd"> json_object (bool): If True, the generated text is amenable to json format. Defaults to False.</span>
|
|
<span class="sd"> """</span>
|
|
<span class="n">json</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="nb">dict</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">regex</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">grammar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">json_object</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
|
|
<span class="nd">@property</span>
|
|
<span class="k">def</span> <span class="nf">num_guides</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="n">num_guides</span> <span class="o">=</span> <span class="mi">0</span>
|
|
<span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">fields</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="n">num_guides</span> <span class="o">+=</span> <span class="nb">bool</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
|
|
<span class="k">return</span> <span class="n">num_guides</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_validate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="k">if</span> <span class="p">(</span><span class="n">num_guides</span> <span class="o">:=</span> <span class="bp">self</span><span class="o">.</span><span class="n">num_guides</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"Only one guide can be used for a request, but got </span><span class="si">{</span><span class="n">num_guides</span><span class="si">}</span><span class="s2">."</span>
|
|
<span class="p">)</span></div>
|
|
|
|
|
|
|
|
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
|
<span class="k">class</span> <span class="nc">AdditionalModelOutput</span><span class="p">:</span>
|
|
<span class="w"> </span><span class="sd">"""</span>
|
|
<span class="sd"> An additional output to gather from the model.</span>
|
|
|
|
<span class="sd"> Args:</span>
|
|
<span class="sd"> name (str): The name of the additional output to gather from the model.</span>
|
|
<span class="sd"> gather_context (bool): A value indicating whether or not to gather the additional output from the context too. Defaults to False.</span>
|
|
<span class="sd"> """</span>
|
|
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span>
|
|
<span class="n">gather_context</span><span class="p">:</span> <span class="nb">bool</span>
|
|
|
|
|
|
<div class="viewcode-block" id="SamplingParams">
|
|
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams">[docs]</a>
|
|
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
|
<span class="k">class</span> <span class="nc">SamplingParams</span><span class="p">:</span>
|
|
<span class="w"> </span><span class="sd">"""</span>
|
|
<span class="sd"> Sampling parameters for text generation.</span>
|
|
|
|
<span class="sd"> Args:</span>
|
|
<span class="sd"> end_id (int, optional): The end token id. Defaults to None.</span>
|
|
<span class="sd"> pad_id (int, optional): The pad token id. Defaults to None.</span>
|
|
<span class="sd"> max_tokens (int): The maximum number of tokens to generate. Defaults to 32.</span>
|
|
<span class="sd"> max_new_tokens (int, optional): The maximum number of tokens to generate. This argument is being deprecated; please use max_tokens instead. Defaults to None.</span>
|
|
<span class="sd"> bad (str, List[str], optional): A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output. Defaults to None.</span>
|
|
<span class="sd"> bad_token_ids (List[int], optional): A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output. Defaults to None.</span>
|
|
<span class="sd"> stop (str, List[str], optional): A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True. Defaults to None.</span>
|
|
<span class="sd"> stop_token_ids (List[int], optional): A list of token ids that stop the generation when they are generated. Defaults to None.</span>
|
|
<span class="sd"> include_stop_str_in_output (bool): Whether to include the stop strings in output text. Defaults to False.</span>
|
|
<span class="sd"> embedding_bias (torch.Tensor, optional): The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. Defaults to None.</span>
|
|
<span class="sd"> external_draft_tokens_config (ExternalDraftTokensConfig, optional): The speculative decoding configuration. Defaults to None.</span>
|
|
<span class="sd"> logits_post_processor_name (str, optional): The logits postprocessor name. Must correspond to one of the logits postprocessor name provided to the ExecutorConfig. Defaults to None.</span>
|
|
|
|
<span class="sd"> n (int): Number of sequences to generate. Defaults to 1.</span>
|
|
<span class="sd"> best_of (int, optional): Number of sequences to consider for best output. Defaults to None.</span>
|
|
<span class="sd"> use_beam_search (bool): Whether to use beam search. Defaults to False.</span>
|
|
|
|
<span class="sd"> beam_width (int): The beam width. Setting 1 disables beam search. This parameter will be deprecated from the LLM API in a future release. Please use n/best_of/use_beam_search instead. Defaults to 1.</span>
|
|
<span class="sd"> num_return_sequences (int, optional): The number of sequences to return. If set to None, it defaults to the value of `beam_width`. This parameter will be deprecated from the LLM API in a future release. Please use n/best_of/use_beam_search instead. Defaults to None.</span>
|
|
|
|
<span class="sd"> top_k (int): Controls number of logits to sample from. Default is 0 (all logits).</span>
|
|
<span class="sd"> top_p (float): Controls the top-P probability to sample from. Default is 0.f</span>
|
|
<span class="sd"> top_p_min (float): Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.</span>
|
|
<span class="sd"> top_p_reset_ids (int): Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.</span>
|
|
<span class="sd"> top_p_decay (float): Controls decay in the top-P algorithm. The decay value. Default is 1.f</span>
|
|
<span class="sd"> seed (int): Controls the random seed used by the random number generator in sampling</span>
|
|
<span class="sd"> random_seed (int): Controls the random seed used by the random number generator in sampling. This argument is being deprecated; please use seed instead.</span>
|
|
<span class="sd"> temperature (float): Controls the modulation of logits when sampling new tokens. It can have values &gt; 0.f. Default is 1.0f</span>
|
|
<span class="sd"> min_tokens (int): Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1.</span>
|
|
<span class="sd"> min_length (int): Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. Default is 1. This argument is being deprecated; please use min_tokens instead.</span>
|
|
<span class="sd"> beam_search_diversity_rate (float): Controls the diversity in beam search.</span>
|
|
<span class="sd"> repetition_penalty (float): Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourage repetition, values &gt; 1.f discourage it. Default is 1.f</span>
|
|
<span class="sd"> presence_penalty (float): Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</span>
|
|
<span class="sd"> frequency_penalty (float): Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. Default is 0.f</span>
|
|
<span class="sd"> length_penalty (float): Controls how to penalize longer sequences in beam search. Default is 0.f</span>
|
|
<span class="sd"> early_stopping (int): Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)</span>
|
|
<span class="sd"> no_repeat_ngram_size (int): Controls how many repeat ngram size are acceptable. Default is 1 &lt;&lt; 30.</span>
|
|
|
|
<span class="sd"> return_log_probs (bool): Controls if Result should contain log probabilities. Default is false.</span>
|
|
<span class="sd"> return_context_logits (bool): Controls if Result should contain the context logits. Default is false.</span>
|
|
<span class="sd"> return_generation_logits (bool): Controls if Result should contain the generation logits. Default is false.</span>
|
|
<span class="sd"> exclude_input_from_output (bool): Controls if output tokens in Result should include the input tokens. Default is true.</span>
|
|
<span class="sd"> return_encoder_output (bool): Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.</span>
|
|
<span class="sd"> return_perf_metrics (bool): Controls if Result should contain the performance metrics for this request. Default is false.</span>
|
|
<span class="sd"> additional_model_outputs (list[AdditionalModelOutput], optional): The additional outputs to gather from the model.</span>
|
|
|
|
<span class="sd"> lookahead_config (LookaheadDecodingConfig, optional): Lookahead decoding config. Defaults to None.</span>
|
|
<span class="sd"> guided_decoding (GuidedDecodingParams, optional): Guided decoding params. Defaults to None.</span>
|
|
|
|
<span class="sd"> ignore_eos (bool): Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. Defaults to False.</span>
|
|
<span class="sd"> detokenize (bool): Whether to detokenize the output. Defaults to True.</span>
|
|
<span class="sd"> add_special_tokens (bool): Whether to add special tokens to the prompt. Defaults to True.</span>
|
|
<span class="sd"> truncate_prompt_tokens (int, optional): If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None.</span>
|
|
<span class="sd"> skip_special_tokens (bool): Whether to skip special tokens in the output. Defaults to True.</span>
|
|
<span class="sd"> spaces_between_special_tokens (bool): Whether to add spaces between special tokens in the output. Defaults to True.</span>
|
|
<span class="sd"> """</span>
|
|
<span class="c1"># [TO DEVELOPER] This class provides an interface to LLMAPI users.</span>
|
|
<span class="c1"># Internally, it manages and dispatches fields to Python bindings of C++ objects, currently including:</span>
|
|
<span class="c1"># (1) all fields of tllme.SamplingConfig;</span>
|
|
<span class="c1"># (2) all fields of tllme.OutputConfig;</span>
|
|
<span class="c1"># (3) some fields of tllme.Request.</span>
|
|
<span class="c1"># If you changed the implementation of C++ objects and corresponding Python bindings, please update:</span>
|
|
<span class="c1"># (1) the fields and corresponding docstring of this class, and</span>
|
|
<span class="c1"># (2) the expected_fields defined in _get_xxx_config methods.</span>
|
|
|
|
<span class="n">end_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">pad_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">max_tokens</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">32</span>
|
|
<span class="n">max_new_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="n">bad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">bad_token_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">_bad_word_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
|
|
<span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
|
|
<span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
|
<span class="n">stop</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">stop_token_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">include_stop_str_in_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">_stop_word_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
|
|
<span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
|
|
<span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
|
|
|
<span class="n">embedding_bias</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">external_draft_tokens_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
|
|
<span class="n">tllme</span><span class="o">.</span><span class="n">ExternalDraftTokensConfig</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">logits_post_processor_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
|
|
<span class="n">best_of</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">use_beam_search</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
|
|
<span class="c1"># Keep the below fields in sync with tllme.SamplingConfig or maintain the mapping table.</span>
|
|
<span class="n">beam_width</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
|
|
<span class="n">num_return_sequences</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">top_k</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">top_p</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">top_p_min</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">top_p_reset_ids</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">top_p_decay</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">random_seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">temperature</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">min_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">min_length</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">beam_search_diversity_rate</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">repetition_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">presence_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">frequency_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">length_penalty</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">early_stopping</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">no_repeat_ngram_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="c1"># Keep the below fields in sync with tllme.OutputConfig</span>
|
|
<span class="n">return_log_probs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">return_context_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">return_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">exclude_input_from_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="n">return_encoder_output</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">return_perf_metrics</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">additional_model_outputs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">list</span><span class="p">[</span><span class="n">AdditionalModelOutput</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="c1"># Lookahead decoding config</span>
|
|
<span class="n">lookahead_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">tllme</span><span class="o">.</span><span class="n">LookaheadDecodingConfig</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="c1"># Guided decoding params</span>
|
|
<span class="n">guided_decoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">GuidedDecodingParams</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
|
|
<span class="c1"># Tokenizer-related configs</span>
|
|
<span class="n">ignore_eos</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
|
<span class="n">detokenize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="n">add_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="n">truncate_prompt_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
|
<span class="n">skip_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="n">spaces_between_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
|
|
|
|
<span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span>
|
|
|
|
<span class="c1"># Handle the compatibility between OpenAI and HF style-parameters.</span>
|
|
<span class="n">hf_style</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">beam_width</span> <span class="o">></span> <span class="mi">1</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">num_return_sequences</span>
|
|
<span class="n">openai_style</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span> <span class="o">></span> <span class="mi">1</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span>
|
|
|
|
<span class="k">if</span> <span class="n">hf_style</span> <span class="ow">and</span> <span class="n">openai_style</span><span class="p">:</span>
|
|
<span class="n">ambiguous_params</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s1">'beam_width'</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">beam_width</span><span class="p">,</span>
|
|
<span class="s1">'num_return_sequences'</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">num_return_sequences</span><span class="p">,</span>
|
|
<span class="s1">'n'</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">,</span>
|
|
<span class="s1">'best_of'</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="p">,</span>
|
|
<span class="s1">'use_beam_search'</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span><span class="p">,</span>
|
|
<span class="p">}</span>
|
|
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
|
<span class="s1">'Got ambiguous parameters. Please specify either Hugging Face '</span>
|
|
<span class="s1">'style parameters (beam_width or num_return_sequences) or '</span>
|
|
<span class="s1">'OpenAI style parameters (n, best_of, or use_beam_search), '</span>
|
|
<span class="sa">f</span><span class="s1">'but not both: </span><span class="si">{</span><span class="n">ambiguous_params</span><span class="si">}</span><span class="s1">. It is recommended to use '</span>
|
|
<span class="s1">'OpenAI style parameters (n, best_of, use_beam_search).'</span><span class="p">)</span>
|
|
|
|
<span class="k">if</span> <span class="n">hf_style</span><span class="p">:</span>
|
|
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
|
|
<span class="s2">"Please use 'n' and 'best_of' for the LLM API. The use of "</span>
|
|
<span class="s2">"'beam_width' and 'num_return_sequences' will be deprecated "</span>
|
|
<span class="s2">"in a future release."</span><span class="p">)</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">n</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">beam_width</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">num_return_sequences</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">beam_width</span> <span class="o">></span> <span class="mi">1</span>
|
|
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span>
|
|
|
|
<span class="k">if</span> <span class="p">(</span><span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span>
|
|
<span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">return_log_probs</span><span class="p">):</span>
|
|
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"Enable 'return_log_probs' to trim the </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">-best among "</span>
|
|
<span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2"> outputs under sampling decoding."</span><span class="p">)</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">return_log_probs</span> <span class="o">=</span> <span class="kc">True</span>
|
|
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">_validate</span><span class="p">()</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_validate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="w"> </span><span class="sd">''' Verify the sampling parameters.</span>
|
|
|
|
<span class="sd"> This function verifies the sampling parameters in the LLM API, which</span>
|
|
<span class="sd"> may have stricter requirements than the Executor class of C++ runtime.</span>
|
|
<span class="sd"> For instance, while greedy decoding with n &gt; 1 is supported by the</span>
|
|
<span class="sd"> Executor class of the C++ runtime, the LLM API disallows such a combination.</span>
|
|
<span class="sd"> '''</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s1">'In beam search, beam_width (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">beam_width</span><span class="si">}</span><span class="s1">) must be '</span>
|
|
<span class="sa">f</span><span class="s1">'greater than or equal to num_return_sequences '</span>
|
|
<span class="sa">f</span><span class="s1">'(</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">num_return_sequences</span><span class="si">}</span><span class="s1">).'</span><span class="p">)</span>
|
|
|
|
<span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">></span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">greedy_decoding</span> <span class="ow">and</span>
|
|
<span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'TLLM_ALLOW_N_GREEDY_DECODING'</span><span class="p">,</span> <span class="kc">None</span><span class="p">)):</span>
|
|
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s1">'Greedy decoding in the LLM API does not allow multiple '</span>
|
|
<span class="sa">f</span><span class="s1">'returns. Please set to best_of=1, got best_of=</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s1">. '</span>
|
|
<span class="sa">f</span><span class="s1">'Please set to best_of=1 or set an environment variable '</span>
|
|
<span class="sa">f</span><span class="s1">'TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of > 1 '</span>
|
|
<span class="sa">f</span><span class="s1">'under the greedy decoding.'</span><span class="p">)</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="o">&lt;</span> <span class="mi">1</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"truncate_prompt_tokens must be >= 1, got </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span><span class="si">}</span><span class="s2">"</span>
|
|
<span class="p">)</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">_validate</span><span class="p">()</span>
|
|
|
|
<span class="nd">@property</span>
|
|
<span class="k">def</span> <span class="nf">greedy_decoding</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="p">(</span><span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span>
|
|
<span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">top_k</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">top_k</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span>
|
|
<span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">top_p</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">top_p</span> <span class="o">==</span> <span class="mf">0.0</span><span class="p">))</span>
|
|
|
|
<div class="viewcode-block" id="SamplingParams.setup">
|
|
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.setup">[docs]</a>
|
|
<span class="k">def</span> <span class="nf">setup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
|
|
<span class="n">tokenizer</span><span class="p">,</span>
|
|
<span class="n">add_special_tokens</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s1">'SamplingParams'</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">end_id</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">eos_token_id</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">pad_token_id</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">pad_id</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">end_id</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">strs</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="o">=</span><span class="n">add_special_tokens</span><span class="p">)</span>
|
|
<span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">strs</span>
|
|
<span class="p">]</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">strs</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="n">tokenizer</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">add_special_tokens</span><span class="o">=</span><span class="n">add_special_tokens</span><span class="p">)</span>
|
|
<span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">strs</span>
|
|
<span class="p">]</span>
|
|
|
|
<span class="k">return</span> <span class="bp">self</span></div>
|
|
|
|
|
|
<span class="k">def</span> <span class="nf">_get_bad_words</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span>
|
|
<span class="n">words</span> <span class="o">=</span> <span class="p">[]</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad_token_ids</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">words</span> <span class="o">=</span> <span class="p">[[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad_token_ids</span><span class="p">]</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">bad</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="n">words</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.bad (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">bad</span><span class="si">}</span><span class="s2">) is not processed by tokenizer, "</span>
|
|
<span class="s2">"please call the setup method."</span><span class="p">)</span>
|
|
<span class="k">return</span> <span class="n">words</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_bad_word_ids</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_get_stop_words</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]:</span>
|
|
<span class="n">words</span> <span class="o">=</span> <span class="p">[]</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">words</span> <span class="o">=</span> <span class="p">[[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="p">]</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="n">words</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.stop (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="si">}</span><span class="s2">) is not processed by tokenizer, "</span>
|
|
<span class="s2">"please call the setup method."</span><span class="p">)</span>
|
|
<span class="k">return</span> <span class="n">words</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_stop_word_ids</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_get_stop_reasons_and_words</span><span class="p">(</span>
|
|
<span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]:</span>
|
|
<span class="n">stop_reasons</span> <span class="o">=</span> <span class="p">[]</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="p">)</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stop</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
|
|
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">)</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="n">stop_reasons</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">)</span>
|
|
<span class="n">stop_words</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_stop_words</span><span class="p">()</span>
|
|
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">stop_reasons</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">stop_words</span><span class="p">):</span>
|
|
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
|
|
<span class="sa">f</span><span class="s2">"The number of </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.stop_token_ids (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">stop_token_ids</span><span class="si">}</span><span class="s2">) "</span>
|
|
<span class="sa">f</span><span class="s2">"and </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.stop (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="si">}</span><span class="s2">) are inconsistent with the "</span>
|
|
<span class="sa">f</span><span class="s2">"processed stop_words (</span><span class="si">{</span><span class="n">stop_words</span><span class="si">}</span><span class="s2">)."</span><span class="p">)</span>
|
|
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">stop_reasons</span><span class="p">,</span> <span class="n">stop_words</span><span class="p">))</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_get_sampling_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">:</span>
|
|
<span class="n">expected_fields</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s2">"beam_width"</span><span class="p">,</span> <span class="s2">"top_k"</span><span class="p">,</span> <span class="s2">"top_p"</span><span class="p">,</span> <span class="s2">"top_p_min"</span><span class="p">,</span> <span class="s2">"top_p_reset_ids"</span><span class="p">,</span>
|
|
<span class="s2">"top_p_decay"</span><span class="p">,</span> <span class="s2">"seed"</span><span class="p">,</span> <span class="s2">"random_seed"</span><span class="p">,</span> <span class="s2">"temperature"</span><span class="p">,</span> <span class="s2">"min_tokens"</span><span class="p">,</span>
|
|
<span class="s2">"min_length"</span><span class="p">,</span> <span class="s2">"beam_search_diversity_rate"</span><span class="p">,</span> <span class="s2">"repetition_penalty"</span><span class="p">,</span>
|
|
<span class="s2">"presence_penalty"</span><span class="p">,</span> <span class="s2">"frequency_penalty"</span><span class="p">,</span> <span class="s2">"length_penalty"</span><span class="p">,</span>
|
|
<span class="s2">"early_stopping"</span><span class="p">,</span> <span class="s2">"no_repeat_ngram_size"</span><span class="p">,</span> <span class="s2">"num_return_sequences"</span>
|
|
<span class="p">}</span>
|
|
<span class="n">found_fields</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="n">f</span>
|
|
<span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">f</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">'__'</span><span class="p">)</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">if</span> <span class="n">found_fields</span> <span class="o">!=</span> <span class="n">expected_fields</span><span class="p">:</span>
|
|
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
|
|
<span class="s2">"Found fields in `tllme.SamplingConfig` different than expected; "</span>
|
|
<span class="sa">f</span><span class="s2">"if `tllme.SamplingConfig` is changed, please update </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2"> accordingly. "</span>
|
|
<span class="s2">"See [TO DEVELOPER] comments for detailed instructions."</span><span class="p">)</span>
|
|
|
|
<span class="c1"># A map from the SamplingConfig fields of the LLM API to their</span>
|
|
<span class="c1"># corresponding field names of the Executor of TRT-LLM C++ runtime.</span>
|
|
<span class="c1"># In sampling, there is no parameter that directly matches 'best_of',</span>
|
|
<span class="c1"># so outputs must be trimmed during postprocessing.</span>
|
|
<span class="c1"># | LLM API | TRT-LLM Executor |</span>
|
|
<span class="c1"># --------------|-----------------|------------------------|</span>
|
|
<span class="c1"># | Beam search | use_beam_search | beam_width > 1 |</span>
|
|
<span class="c1"># | Beam search | n | num_return_sequences |</span>
|
|
<span class="c1"># | Beam search | best_of | beam_width |</span>
|
|
<span class="c1"># |-------------|-----------------|------------------------|</span>
|
|
<span class="c1"># | Sampling | use_beam_search | beam_width == 1 |</span>
|
|
<span class="c1"># | Sampling | n | num_return_sequences |</span>
|
|
<span class="c1"># | Sampling | best_of | no corresponding param |</span>
|
|
<span class="n">unmatched_params</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="s1">'num_return_sequences'</span><span class="p">,</span> <span class="s1">'beam_width'</span><span class="p">,</span> <span class="s1">'n'</span><span class="p">,</span> <span class="s1">'best_of'</span><span class="p">,</span>
|
|
<span class="s1">'use_beam_search'</span>
|
|
<span class="p">]</span>
|
|
<span class="n">llmapi_to_rt_param_map</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="n">f</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span>
|
|
<span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">expected_fields</span> <span class="k">if</span> <span class="n">f</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">unmatched_params</span>
|
|
<span class="p">}</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_beam_search</span><span class="p">:</span>
|
|
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s1">'num_return_sequences'</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span>
|
|
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s1">'beam_width'</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s1">'num_return_sequences'</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span>
|
|
<span class="n">llmapi_to_rt_param_map</span><span class="p">[</span><span class="s1">'beam_width'</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
|
|
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">SamplingConfig</span><span class="p">(</span><span class="o">**</span><span class="n">llmapi_to_rt_param_map</span><span class="p">)</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_get_output_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">:</span>
|
|
<span class="n">expected_fields</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="s2">"return_log_probs"</span><span class="p">,</span> <span class="s2">"return_context_logits"</span><span class="p">,</span>
|
|
<span class="s2">"return_generation_logits"</span><span class="p">,</span> <span class="s2">"exclude_input_from_output"</span><span class="p">,</span>
|
|
<span class="s2">"return_encoder_output"</span><span class="p">,</span> <span class="s2">"return_perf_metrics"</span><span class="p">,</span>
|
|
<span class="s2">"additional_model_outputs"</span>
|
|
<span class="p">]</span>
|
|
<span class="n">found_fields</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">f</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">'__'</span><span class="p">)</span>
|
|
<span class="p">]</span>
|
|
<span class="k">if</span> <span class="nb">set</span><span class="p">(</span><span class="n">found_fields</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">set</span><span class="p">(</span><span class="n">expected_fields</span><span class="p">):</span>
|
|
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span>
|
|
<span class="s2">"Found fields in `tllme.OutputConfig` different than expected; "</span>
|
|
<span class="sa">f</span><span class="s2">"if `tllme.OutputConfig` is changed, please update </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2"> accordingly. "</span>
|
|
<span class="s2">"See [TO DEVELOPER] comments for detailed instructions."</span><span class="p">)</span>
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">OutputConfig</span><span class="p">(</span>
|
|
<span class="o">**</span><span class="p">{</span><span class="n">f</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span>
|
|
<span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">expected_fields</span><span class="p">})</span>
|
|
|
|
<span class="k">def</span> <span class="nf">_get_guided_decoding_params</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">:</span>
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">num_guides</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="kc">None</span>
|
|
|
|
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json_object</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
|
|
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">JSON</span><span class="p">)</span>
|
|
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="n">json_schema</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">json</span>
|
|
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json_schema</span><span class="p">,</span> <span class="n">BaseModel</span><span class="p">):</span>
|
|
<span class="n">json_schema</span> <span class="o">=</span> <span class="n">json_schema</span><span class="o">.</span><span class="n">model_json_schema</span><span class="p">()</span>
|
|
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json_schema</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
|
|
<span class="n">json_schema</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">json_schema</span><span class="p">)</span>
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
|
|
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">JSON_SCHEMA</span><span class="p">,</span> <span class="n">json_schema</span><span class="p">)</span>
|
|
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">regex</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
|
|
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">REGEX</span><span class="p">,</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">regex</span><span class="p">)</span>
|
|
<span class="k">else</span><span class="p">:</span>
|
|
<span class="k">return</span> <span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="p">(</span>
|
|
<span class="n">tllme</span><span class="o">.</span><span class="n">GuidedDecodingParams</span><span class="o">.</span><span class="n">GuideType</span><span class="o">.</span><span class="n">EBNF_GRAMMAR</span><span class="p">,</span>
|
|
<span class="bp">self</span><span class="o">.</span><span class="n">guided_decoding</span><span class="o">.</span><span class="n">grammar</span><span class="p">)</span></div>
|
|
|
|
</pre></div>
|
|
|
|
</div>
|
|
</div>
|
|
<footer>
|
|
|
|
<hr/>
|
|
|
|
<div role="contentinfo">
|
|
<!-- Removed a leaked "jinja2.runtime.BlockReference" repr: the footer template likely wrote {{ super }} where {{ super() }} was intended. -->
|
|
|
|
<div class="footer">
|
|
<p>
|
|
Copyright © 2024 NVIDIA Corporation
|
|
</p>
|
|
<p>
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Privacy Policy</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Manage My Privacy</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
|
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Accessibility</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
|
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Product Security</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Contact</a>
|
|
</p>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</footer>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
</div>
|
|
<script>
|
|
jQuery(function () {
|
|
SphinxRtdTheme.Navigation.enable(true);
|
|
});
|
|
</script>
|
|
|
|
</body>
|
|
</html> |