<section id="speculative-sampling">
|
||
<h1>Speculative Sampling<a class="headerlink" href="#speculative-sampling" title="Link to this heading"></a></h1>
|
||
<ul class="simple">
|
||
<li><p><a class="reference internal" href="#about-speculative-sampling">About Speculative Sampling</a></p></li>
|
||
<li><p><a class="reference internal" href="#Performance-improvements"><span class="xref myst">Performance Improvements</span></a></p></li>
|
||
<li><p><a class="reference internal" href="#Draft-Target-Model"><span class="xref myst">Draft-Target-Model</span></a></p>
|
||
<ul>
|
||
<li><p><a class="reference internal" href="#Using-Draft-model-approach-with-Triton-Inference-Server"><span class="xref myst">Using Draft model approach with Triton Inference Server</span></a></p></li>
|
||
</ul>
|
||
</li>
|
||
<li><p><a class="reference internal" href="#prompt-lookup-decoding">Prompt-Lookup-Decoding</a></p></li>
|
||
<li><p><a class="reference internal" href="#medusa">Medusa</a></p>
|
||
<ul>
|
||
<li><p><a class="reference internal" href="#medusa-tree">Medusa Tree</a></p></li>
|
||
<li><p><a class="reference internal" href="#using-medusa-with-tensorrt-llm">Using Medusa with TensorRT-LLM</a></p>
|
||
<ul>
|
||
<li><p><a class="reference internal" href="#limitations">Limitations</a></p></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li><p><a class="reference internal" href="#redrafter">ReDrafter</a></p></li>
|
||
<li><p><a class="reference internal" href="#eagle">EAGLE</a></p></li>
|
||
<li><p><a class="reference internal" href="#lookahead-decoding">Lookahead decoding</a></p></li>
|
||
</ul>
|
||
<section id="about-speculative-sampling">
|
||
<h2>About Speculative Sampling<a class="headerlink" href="#about-speculative-sampling" title="Link to this heading"></a></h2>
|
||
<p>Speculative Sampling (also referred to as Speculative Decoding) is a set of techniques designed to allow generation of more than one token per forward pass iteration. This can lead to a reduction in the average per-token latency <strong>in situations where the GPU
|
||
is underutilized due to small batch sizes.</strong></p>
|
||
<p>Speculative Sampling involves predicting a sequence of future tokens, referred to as draft tokens, using a method
|
||
that is substantially more efficient than repeatedly executing the target Large Language Model (LLM).
|
||
These draft tokens are then collectively validated by processing them through the target LLM in a single forward pass.
|
||
The underlying assumptions are twofold:</p>
|
||
<ol class="arabic simple">
|
||
<li><p>processing multiple draft tokens concurrently will be as rapid as processing a single token</p></li>
|
||
<li><p>multiple draft tokens will be validated successfully over the course of the full generation</p></li>
|
||
</ol>
|
||
<p>If the first assumption holds true, the latency of speculative decoding will no worse than the standard approach. If the second holds, output token generation advances by statistically more than one token per forward pass.
|
||
The combination of both these allows speculative decoding to result in reduced latency.</p>
|
||
<p>TensorRT-LLM supports several approaches for generating draft tokens, including:</p>
|
||
<ol class="arabic simple">
|
||
<li><p>Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the <a class="reference external" href="https://arxiv.org/pdf/2211.17192.pdf">Fast Inference from Transformers via Speculative Decoding paper</a>.</p></li>
|
||
<li><p>Implementing additional language model heads that predict tokens for future positions:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><a class="reference external" href="https://arxiv.org/abs/2401.10774">Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper</a>.</p></li>
|
||
<li><p><a class="reference external" href="https://arxiv.org/html/2403.09919v1">Recurrent Drafter for Fast Speculative Decoding in Large Language Models</a>.</p></li>
|
||
<li><p><a class="reference external" href="https://arxiv.org/pdf/2401.15077">EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty</a>.</p></li>
|
||
</ol>
|
||
</li>
|
||
<li><p>Utilizing prompt tokens as draft tokens. For more information, refer to <a class="reference external" href="https://github.com/apoorvumang/prompt-lookup-decoding/">Prompt Lookup Decoding</a>.</p></li>
|
||
<li><p>Utilizing Jacobi-like decoding to predict and verify draft tokens using the same model which does not need additional fine-tuning. Refer to <a class="reference external" href="https://arxiv.org/pdf/2402.02057">Break the Sequential Dependency of LLM Inference Using Lookahead Decoding</a>.</p></li>
|
||
</ol>
|
||
</section>
|
||
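
All of these approaches share the same draft-and-verify flow. The following is a minimal, framework-agnostic sketch of one greedy iteration, not the TensorRT-LLM implementation: `draft_next_tokens` (a cheap draft proposer) and `target_logits` (a single target forward pass returning per-position next-token logits) are hypothetical stand-ins, and `prompt` is a list of token IDs.

```python
# Conceptual sketch of one greedy speculative decoding iteration (illustration
# only; this is not TensorRT-LLM code). `draft_next_tokens` proposes up to K
# tokens cheaply and `target_logits` returns per-position next-token logits
# (e.g. NumPy arrays) for the whole sequence in a single target forward pass.

def speculative_step(prompt, draft_next_tokens, target_logits, k=4):
    """Run one draft-and-verify iteration and return the newly accepted tokens."""
    draft = draft_next_tokens(prompt, k)            # up to K cheap draft tokens
    logits = target_logits(prompt + draft)          # one target forward pass
    accepted = []
    for i, token in enumerate(draft):
        # Target prediction for the position that draft token i occupies.
        predicted = int(logits[len(prompt) + i - 1].argmax())
        if predicted != token:                      # first mismatch stops acceptance,
            accepted.append(predicted)              # but the target's own token is kept
            return accepted
        accepted.append(token)
    # Every draft token matched: the target's prediction after the last draft is free.
    accepted.append(int(logits[len(prompt) + len(draft) - 1].argmax()))
    return accepted                                 # between 1 and K+1 new tokens
```

Even when every draft token is rejected, the step still yields one token (the target's own prediction), so generation never advances by fewer tokens than standard decoding.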
<section id="performance-improvements">
|
||
<h2>Performance Improvements<a class="headerlink" href="#performance-improvements" title="Link to this heading"></a></h2>
|
||
<p>It’s important to note that the effectiveness of speculative decoding techniques is highly dependent
|
||
on the specific task at hand. For instance, forecasting subsequent tokens in a code-completion scenario
|
||
may prove simpler than generating a summary for an article.</p>
|
||
<p>Furthermore, when integrating Medusa with a standard PyTorch model implementation which may not be as finely
|
||
tuned as TensorRT-LLM, the potential time savings are more pronounced.</p>
|
||
</section>
|
||
<section id="draft-target-model">
|
||
<h2>Draft-Target-Model<a class="headerlink" href="#draft-target-model" title="Link to this heading"></a></h2>
|
||
<p>The Draft-Target-Model involves the use of two distinct models trained independently but sharing the same vocabulary: a smaller Draft model and a larger Target model. For example, GPT 125M / 6.7B models can serve as the Draft / Target model.</p>
|
||
<p>There are two styles of using Draft-Target-Model in TensorRT-LLM now. The first one is using TensorRT-LLM-BLS in Triton, which more information and detailed steps can be found in this document. The second one is using it directly in TensorRT-LLM, which steps can be found in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/draft_target_model/README.md">examples/draft_target_model/README.md</a> and the code can be found in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py">examples/prompt_lookup/run_dtm_pld.py</a>.</p>
|
||
<p>The management of Draft and Target models is facilitated through two separate <code class="docutils literal notranslate"><span class="pre">GptManager</span></code> instances.
|
||
It is essential that you to coordinate the interactions between the Draft and Target models effectively.
|
||
Initially, the Draft model is queried to generate up to <code class="docutils literal notranslate"><span class="pre">K</span></code> draft tokens.
|
||
These tokens are then forwarded to the Target model for verification.
|
||
Upon verification, the Target model may return up to <code class="docutils literal notranslate"><span class="pre">K+1</span></code> tokens.
|
||
Subsequently, the prompt, now updated with the accepted tokens, is sent back to the Draft model to initiate the generation of new draft tokens.
|
||
This iterative process continues until a predefined stop conditions are met.
|
||
An example of this orchestration process can be found in the <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py">TensorRT-LLM Triton backend</a>.</p>
|
||
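
The orchestration described above can be summarized in a short client-side sketch. This is only an outline under stated assumptions: `query_draft` and `query_target` are hypothetical wrappers around the two model instances (for example, the two `GptManager`/Triton model endpoints), and `end_id` is the tokenizer's end-of-sequence ID.

```python
# Client-side orchestration sketch for the Draft-Target-Model flow (not a
# TensorRT-LLM API). `query_draft` returns up to K draft tokens and
# `query_target` returns the tokens the target accepted (up to K + 1).

def generate_with_draft_target(prompt_ids, query_draft, query_target,
                               k=10, max_new_tokens=128, end_id=2):
    output = list(prompt_ids)
    produced = 0
    while produced < max_new_tokens:
        # 1. Query the Draft model for up to K draft tokens (maxNewTokens = K).
        draft_tokens = query_draft(output, max_new_tokens=k)
        # 2. Query the Target model with the draft tokens attached
        #    (maxNewTokens = 1); it may return up to K + 1 accepted tokens.
        accepted = query_target(output, draft_tokens=draft_tokens)
        if not accepted:                  # defensive stop; the target returns >= 1 token
            break
        # 3. The accepted tokens extend the prompt for the next draft query.
        output.extend(accepted)
        produced += len(accepted)
        # 4. Stop when an end-of-sequence token appears or the budget is exhausted.
        if end_id in accepted:
            break
    return output
```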

Configuring and executing the Draft model within the Inflight Fused Batching (IFB) framework follows the same procedure as for any other model within IFB. The `maxNewTokens` parameter should be set to the number of draft tokens in the `LlmRequest` for the Draft model query.

When building the Target model, it is necessary to specify the `--max_draft_len <K> --speculative_decoding_mode draft_tokens_external` option to the `trtllm-build` command. During the Target model's inference phase in IFB, `maxNewTokens` should be set to `1`, and the draft tokens must be set in the `draftTokens` field of the `LlmRequest` for the Target model query.

**NOTE:** To enhance performance, especially due to the repetitive querying of the Draft and Target models with requests that share a common prefix, it is advisable to enable KV cache reuse for both models. This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command and setting `enableBlockReuse=true` in the `KVCacheConfig`.
<section id="using-draft-target-model-approach-with-triton-inference-server">
|
||
<h3>Using Draft-Target-Model approach with Triton Inference Server<a class="headerlink" href="#using-draft-target-model-approach-with-triton-inference-server" title="Link to this heading"></a></h3>
|
||
<p>This example is only relevant for Draft-Target-Model model method. For all other speculative decoding models, you can deploy them in Triton server in the same way as standard non-speculative autoregressive models.</p>
|
||
<ul class="simple">
|
||
<li><p>Draft model approach is supported since TensorRT-LLM-0.7.0 (using two separate Tritonserver to maintain draft and target model respectively), but has significant optimization in TensorRT-LLM-0.10.0 (using one Tritonserver with <a class="reference external" href="https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting">Business Logic Scripting</a>, BLS).</p></li>
|
||
<li><p>The source file of Draft model with BLS can be found <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py">here</a>.</p></li>
|
||
<li><p>This example is based on TensorRT-LLM-0.10.0 and TRTLLM-backend-0.10.0, using docker image <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.05-trtllm-py3</span></code>.</p></li>
|
||
<li><p>Llama-7B-hf and Llama-30B-hf are used as draft and target model respectively in this example, assuming the paths to the models’ repository are <code class="docutils literal notranslate"><span class="pre">DRAFT_MODEL_PATH</span></code> and <code class="docutils literal notranslate"><span class="pre">TARGET_MODEL_PATH</span></code>.</p></li>
|
||
<li><p>Maximum number of draft tokens is set to 10 in this example.</p></li>
|
||
</ul>
|
||
<ol class="arabic">
|
||
<li><p>Prepare TensorRT engine for inference</p>
|
||
<ul class="simple">
|
||
<li><p>Here are the commands to build draft / target engines in FP16 or FP8. All combinations of the data type (Draft-FP16/FP8 + Target-FP16/FP8) are supported.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--remove_input_padding=enable</span> <span class="pre">--paged_kv_cache=enable</span></code> are necessary for inflight-batching.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--context_fmha=enable</span> <span class="pre">--use_paged_context_fmha=enable</span></code> are optional, but recommended for the performance.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--gather_generation_logits</span></code> is necessary if using generation logits for selecting tokens in target model.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--tp_size</span></code> can be modified set if using TP mode for draft / target model.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--max_batch_size</span></code> more than 1 is acceptable in general usage, but we use 1 in this example.</p></li>
|
||
</ul>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">MAX_DRAFT_LENGTH</span><span class="o">=</span><span class="m">10</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">COMMON_COMMAND</span><span class="o">=</span><span class="s2">"--max_batch_size=1 --max_input_len=2048 --max_seq_len=3072 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits"</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_COMMAND_FP16</span><span class="o">=</span><span class="s2">"</span><span class="nv">$COMMON_COMMAND</span><span class="s2">"</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_COMMAND_FP16</span><span class="o">=</span><span class="s2">"</span><span class="nv">$DRAFT_COMMAND_FP16</span><span class="s2"> --max_draft_len=</span><span class="nv">$MAX_DRAFT_LENGTH</span><span class="s2"> --speculative_decoding_mode draft_tokens_external"</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_COMMAND_FP8</span><span class="o">=</span><span class="s2">"</span><span class="nv">$COMMON_COMMAND</span><span class="s2"> --use_fp8_context_fmha=enable"</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_COMMAND_FP8</span><span class="o">=</span><span class="s2">"</span><span class="nv">$DRAFT_COMMAND_FP8</span><span class="s2"> --max_draft_len=</span><span class="nv">$MAX_DRAFT_LENGTH</span><span class="s2"> --speculative_decoding_mode draft_tokens_external"</span>
|
||
|
||
<span class="c1"># Build checkpoints and engines in tensorrt_llm/examples/llama/</span>
|
||
<span class="c1"># FP16 mode</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_NAME</span><span class="o">=</span>llama-7b-fp16-tp1
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_NAME</span><span class="o">=</span>llama-30b-fp16-tp1
|
||
python3<span class="w"> </span>convert_checkpoint.py<span class="w"> </span>--model_dir<span class="o">=</span><span class="nv">$DRAFT_MODEL_PATH</span><span class="w"> </span>--output_dir<span class="o">=</span>ckpt/<span class="nv">$DRAFT_NAME</span><span class="w"> </span>--tp_size<span class="o">=</span><span class="m">1</span>
|
||
python3<span class="w"> </span>convert_checkpoint.py<span class="w"> </span>--model_dir<span class="o">=</span><span class="nv">$TARGET_MODEL_PATH</span><span class="w"> </span>--output_dir<span class="o">=</span>ckpt/<span class="nv">$TARGET_NAME</span><span class="w"> </span>--tp_size<span class="o">=</span><span class="m">1</span>
|
||
trtllm-build<span class="w"> </span>--checkpoint_dir<span class="o">=</span>ckpt/<span class="nv">$DRAFT_NAME</span><span class="w"> </span>--output_dir<span class="o">=</span>engine/draft/<span class="nv">$DRAFT_NAME</span><span class="w"> </span><span class="nv">$DRAFT_COMMAND_FP16</span>
|
||
trtllm-build<span class="w"> </span>--checkpoint_dir<span class="o">=</span>ckpt/<span class="nv">$TARGET_NAME</span><span class="w"> </span>--output_dir<span class="o">=</span>engine/target/<span class="nv">$TARGET_NAME</span><span class="w"> </span><span class="nv">$TARGET_COMMAND_FP16</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_ENGINE_PATH</span><span class="o">=</span><span class="k">$(</span><span class="nb">pwd</span><span class="k">)</span>/engine/draft/<span class="nv">$DRAFT_NAME</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_ENGINE_PATH</span><span class="o">=</span><span class="k">$(</span><span class="nb">pwd</span><span class="k">)</span>/engine/target/<span class="nv">$TARGET_NAME</span>
|
||
|
||
<span class="c1"># FP8 mode</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_NAME</span><span class="o">=</span>llama-7b-fp8-tp1
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_NAME</span><span class="o">=</span>llama-30b-fp8-tp1
|
||
python3<span class="w"> </span>../quantization/quantize.py<span class="w"> </span>--model_dir<span class="o">=</span><span class="nv">$DRAFT_MODEL_PATH</span><span class="w"> </span>--dtype<span class="w"> </span>float16<span class="w"> </span>--qformat<span class="w"> </span>fp8<span class="w"> </span>--kv_cache_dtype<span class="w"> </span>fp8<span class="w"> </span>--output_dir<span class="o">=</span>ckpt/<span class="nv">$DRAFT_NAME</span><span class="w"> </span>--tp_size<span class="o">=</span><span class="m">1</span>
|
||
python3<span class="w"> </span>../quantization/quantize.py<span class="w"> </span>--model_dir<span class="o">=</span><span class="nv">$TARGET_MODEL_PATH</span><span class="w"> </span>--dtype<span class="w"> </span>float16<span class="w"> </span>--qformat<span class="w"> </span>fp8<span class="w"> </span>--kv_cache_dtype<span class="w"> </span>fp8<span class="w"> </span>--output_dir<span class="o">=</span>ckpt/<span class="nv">$TARGET_NAME</span><span class="w"> </span>--tp_size<span class="o">=</span><span class="m">1</span>
|
||
trtllm-build<span class="w"> </span>--checkpoint_dir<span class="o">=</span>ckpt/<span class="nv">$DRAFT_NAME</span><span class="w"> </span>--output_dir<span class="o">=</span>engine/draft/<span class="nv">$DRAFT_NAME</span><span class="w"> </span><span class="nv">$DRAFT_COMMAND_FP8</span>
|
||
trtllm-build<span class="w"> </span>--checkpoint_dir<span class="o">=</span>ckpt/<span class="nv">$TARGET_NAME</span><span class="w"> </span>--output_dir<span class="o">=</span>engine/target/<span class="nv">$TARGET_NAME</span><span class="w"> </span><span class="nv">$TARGET_COMMAND_FP8</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">DRAFT_ENGINE_PATH</span><span class="o">=</span><span class="k">$(</span><span class="nb">pwd</span><span class="k">)</span>/engine/draft/<span class="nv">$DRAFT_NAME</span>
|
||
<span class="nb">export</span><span class="w"> </span><span class="nv">TARGET_ENGINE_PATH</span><span class="o">=</span><span class="k">$(</span><span class="nb">pwd</span><span class="k">)</span>/engine/target/<span class="nv">$TARGET_NAME</span>
|
||
</pre></div>
|
||
</div>
|
||
</li>
|
||

2. Edit the Triton configuration

   - If both the draft and target models can be placed on one GPU (for example, Llama-7B-FP8 + Llama-30B-FP8, roughly 40 GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This tends to give better performance than placing the models on two separate GPUs.
   - Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`.
   - Furthermore, if TP mode is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`.
   - For more configuration options for launching models with Tritonserver, see the [TensorRT-LLM Backend repo](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">ACCUMULATE_TOKEN</span><span class="o">=</span><span class="s2">"false"</span>
|
||
<span class="nv">BACKEND</span><span class="o">=</span><span class="s2">"tensorrtllm"</span>
|
||
<span class="nv">BATCH_SCHEDULER_POLICY</span><span class="o">=</span><span class="s2">"guaranteed_no_evict"</span>
|
||
<span class="nv">BATCHING_STRATEGY</span><span class="o">=</span><span class="s2">"inflight_fused_batching"</span>
|
||
<span class="nv">BLS_INSTANCE_COUNT</span><span class="o">=</span><span class="s2">"1"</span>
|
||
<span class="nv">DECODING_MODE</span><span class="o">=</span><span class="s2">"top_k_top_p"</span>
|
||
<span class="nv">DECOUPLED_MODE</span><span class="o">=</span><span class="s2">"False"</span>
|
||
<span class="nv">DRAFT_GPU_DEVICE_IDS</span><span class="o">=</span><span class="s2">"0"</span>
|
||
<span class="nv">E2E_MODEL_NAME</span><span class="o">=</span><span class="s2">"ensemble"</span>
|
||
<span class="nv">ENABLE_KV_CACHE_REUSE</span><span class="o">=</span><span class="s2">"true"</span>
|
||
<span class="nv">ENGINE_PATH</span><span class="o">=</span><span class="nv">$TARGET_ENGINE_PATH</span>
|
||
<span class="nv">EXCLUDE_INPUT_IN_OUTPUT</span><span class="o">=</span><span class="s2">"false"</span>
|
||
<span class="nv">KV_CACHE_FREE_GPU_MEM_FRACTION</span><span class="o">=</span><span class="s2">"0.8"</span>
|
||
<span class="nv">MAX_ATTENTION_WINDOW_SIZE</span><span class="o">=</span><span class="s2">""</span>
|
||
<span class="nv">MAX_BEAM_WIDTH</span><span class="o">=</span><span class="s2">"1"</span>
|
||
<span class="nv">MAX_QUEUE_DELAY_MICROSECONDS</span><span class="o">=</span><span class="s2">"0"</span>
|
||
<span class="nv">MAX_TOKENS_IN_KV_CACHE</span><span class="o">=</span><span class="s2">""</span>
|
||
<span class="nv">NORMALIZE_LOG_PROBS</span><span class="o">=</span><span class="s2">"true"</span>
|
||
<span class="nv">POSTPROCESSING_INSTANCE_COUNT</span><span class="o">=</span><span class="s2">"1"</span>
|
||
<span class="nv">PREPROCESSING_INSTANCE_COUNT</span><span class="o">=</span><span class="s2">"1"</span>
|
||
<span class="nv">TARGET_GPU_DEVICE_IDS</span><span class="o">=</span><span class="s2">"1"</span>
|
||
<span class="nv">TENSORRT_LLM_DRAFT_MODEL_NAME</span><span class="o">=</span><span class="s2">"tensorrt_llm_draft"</span>
|
||
<span class="nv">TENSORRT_LLM_MODEL_NAME</span><span class="o">=</span><span class="s2">"tensorrt_llm"</span>
|
||
<span class="nv">TOKENIZER_PATH</span><span class="o">=</span><span class="nv">$DRAFT_MODEL_PATH</span>
|
||
<span class="nv">TOKENIZER_TYPE</span><span class="o">=</span>llama
|
||
<span class="nv">TRITON_GRPC_PORT</span><span class="o">=</span><span class="s2">"8001"</span>
|
||
<span class="nv">TRITON_HTTP_PORT</span><span class="o">=</span><span class="s2">"8000"</span>
|
||
<span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="o">=</span><span class="s2">"4"</span>
|
||
<span class="nv">TRITON_METRICS_PORT</span><span class="o">=</span><span class="s2">"8002"</span>
|
||
<span class="nv">TRITON_REPO</span><span class="o">=</span><span class="s2">"triton_repo"</span>
|
||
<span class="nv">USE_DRAFT_LOGITS</span><span class="o">=</span><span class="s2">"false"</span>
|
||
|
||
<span class="c1"># Make a copy of triton repo and replace the fields in the configuration files</span>
|
||
<span class="nb">cd</span><span class="w"> </span>/tensorrtllm_backend/
|
||
apt-get<span class="w"> </span>update<span class="w"> </span><span class="o">&&</span><span class="w"> </span>apt-get<span class="w"> </span>install<span class="w"> </span>-y<span class="w"> </span>build-essential<span class="w"> </span>cmake<span class="w"> </span>git-lfs
|
||
pip3<span class="w"> </span>install<span class="w"> </span>git-lfs<span class="w"> </span>tritonclient<span class="w"> </span>grpcio
|
||
rm<span class="w"> </span>-rf<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>
|
||
cp<span class="w"> </span>-R<span class="w"> </span>all_models/inflight_batcher_llm<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/ensemble/config.pbtxt<span class="w"> </span>triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/preprocessing/config.pbtxt<span class="w"> </span>tokenizer_dir:<span class="si">${</span><span class="nv">TOKENIZER_PATH</span><span class="si">}</span>,triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>,preprocessing_instance_count:<span class="si">${</span><span class="nv">PREPROCESSING_INSTANCE_COUNT</span><span class="si">}</span>
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/postprocessing/config.pbtxt<span class="w"> </span>tokenizer_dir:<span class="si">${</span><span class="nv">TOKENIZER_PATH</span><span class="si">}</span>,triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>,postprocessing_instance_count:<span class="si">${</span><span class="nv">POSTPROCESSING_INSTANCE_COUNT</span><span class="si">}</span>
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm_bls/config.pbtxt<span class="w"> </span>triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>,decoupled_mode:<span class="si">${</span><span class="nv">DECOUPLED_MODE</span><span class="si">}</span>,accumulate_tokens:<span class="si">${</span><span class="nv">ACCUMULATE_TOKEN</span><span class="si">}</span>,bls_instance_count:<span class="si">${</span><span class="nv">BLS_INSTANCE_COUNT</span><span class="si">}</span>,tensorrt_llm_model_name:<span class="si">${</span><span class="nv">TENSORRT_LLM_MODEL_NAME</span><span class="si">}</span>,tensorrt_llm_draft_model_name:<span class="si">${</span><span class="nv">TENSORRT_LLM_DRAFT_MODEL_NAME</span><span class="si">}</span>
|
||
|
||
<span class="c1"># Make a copy of tensorrt_llm as configurations of draft / target models.</span>
|
||
cp<span class="w"> </span>-R<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm_draft
|
||
sed<span class="w"> </span>-i<span class="w"> </span><span class="s1">'s/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g'</span><span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm_draft/config.pbtxt
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm/config.pbtxt<span class="w"> </span>triton_backend:<span class="si">${</span><span class="nv">BACKEND</span><span class="si">}</span>,engine_dir:<span class="si">${</span><span class="nv">ENGINE_PATH</span><span class="si">}</span>,decoupled_mode:<span class="si">${</span><span class="nv">DECOUPLED_MODE</span><span class="si">}</span>,max_tokens_in_paged_kv_cache:<span class="si">${</span><span class="nv">MAX_TOKENS_IN_KV_CACHE</span><span class="si">}</span>,max_attention_window_size:<span class="si">${</span><span class="nv">MAX_ATTENTION_WINDOW_SIZE</span><span class="si">}</span>,batch_scheduler_policy:<span class="si">${</span><span class="nv">BATCH_SCHEDULER_POLICY</span><span class="si">}</span>,batching_strategy:<span class="si">${</span><span class="nv">BATCHING_STRATEGY</span><span class="si">}</span>,kv_cache_free_gpu_mem_fraction:<span class="si">${</span><span class="nv">KV_CACHE_FREE_GPU_MEM_FRACTION</span><span class="si">}</span>,exclude_input_in_output:<span class="si">${</span><span class="nv">EXCLUDE_INPUT_IN_OUTPUT</span><span class="si">}</span>,triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>,max_queue_delay_microseconds:<span class="si">${</span><span class="nv">MAX_QUEUE_DELAY_MICROSECONDS</span><span class="si">}</span>,max_beam_width:<span class="si">${</span><span class="nv">MAX_BEAM_WIDTH</span><span class="si">}</span>,enable_kv_cache_reuse:<span class="si">${</span><span class="nv">ENABLE_KV_CACHE_REUSE</span><span class="si">}</span>,normalize_log_probs:<span class="si">${</span><span class="nv">NORMALIZE_LOG_PROBS</span><span class="si">}</span>,enable_chunked_context:<span class="si">${</span><span class="nv">ENABLE_CHUNKED_CONTEXT</span><span class="si">}</span>,gpu_device_ids:<span class="si">${</span><span class="nv">TARGET_GPU_DEVICE_IDS</span><span class="si">}</span>,decoding_mode:<span class="si">${</span><span class="nv">DECODING_MODE</span><span class="si">}</span>,encoder_input_features_data_type:TYPE_FP16
|
||
python3<span class="w"> </span>tools/fill_template.py<span class="w"> </span>-i<span class="w"> </span><span class="si">${</span><span class="nv">TRITON_REPO</span><span class="si">}</span>/tensorrt_llm_draft/config.pbtxt<span class="w"> </span>triton_backend:<span class="si">${</span><span class="nv">BACKEND</span><span class="si">}</span>,engine_dir:<span class="si">${</span><span class="nv">DRAFT_ENGINE_PATH</span><span class="si">}</span>,decoupled_mode:<span class="si">${</span><span class="nv">DECOUPLED_MODE</span><span class="si">}</span>,max_tokens_in_paged_kv_cache:<span class="si">${</span><span class="nv">MAX_TOKENS_IN_KV_CACHE</span><span class="si">}</span>,max_attention_window_size:<span class="si">${</span><span class="nv">MAX_ATTENTION_WINDOW_SIZE</span><span class="si">}</span>,batch_scheduler_policy:<span class="si">${</span><span class="nv">BATCH_SCHEDULER_POLICY</span><span class="si">}</span>,batching_strategy:<span class="si">${</span><span class="nv">BATCHING_STRATEGY</span><span class="si">}</span>,kv_cache_free_gpu_mem_fraction:<span class="si">${</span><span class="nv">KV_CACHE_FREE_GPU_MEM_FRACTION</span><span class="si">}</span>,exclude_input_in_output:<span class="si">${</span><span class="nv">EXCLUDE_INPUT_IN_OUTPUT</span><span class="si">}</span>,triton_max_batch_size:<span class="si">${</span><span class="nv">TRITON_MAX_BATCH_SIZE</span><span class="si">}</span>,max_queue_delay_microseconds:<span class="si">${</span><span class="nv">MAX_QUEUE_DELAY_MICROSECONDS</span><span class="si">}</span>,max_beam_width:<span class="si">${</span><span class="nv">MAX_BEAM_WIDTH</span><span class="si">}</span>,enable_kv_cache_reuse:<span class="si">${</span><span class="nv">ENABLE_KV_CACHE_REUSE</span><span class="si">}</span>,normalize_log_probs:<span class="si">${</span><span class="nv">NORMALIZE_LOG_PROBS</span><span class="si">}</span>,enable_chunked_context:<span class="si">${</span><span class="nv">ENABLE_CHUNKED_CONTEXT</span><span class="si">}</span>,gpu_device_ids:<span class="si">${</span><span class="nv">DRAFT_GPU_DEVICE_IDS</span><span class="si">}</span>,decoding_mode:<span class="si">${</span><span class="nv">DECODING_MODE</span><span class="si">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</li>
|
||

3. Launch the Triton server

   - `--multi-model` is necessary if TP mode is used for the target model.

   ```bash
   python3 scripts/launch_triton_server.py \
       --model_repo=${TRITON_REPO} \
       --tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \
       --multi-model \
       --log &
   ```

   - The verbose log is written to the file `triton_log.txt`. The Triton server has launched successfully if you see the output below in that file:

   ```txt
   Started HTTPService at 0.0.0.0:8000
   Started GRPCInferenceService at 0.0.0.0:8001
   Started Metrics Service at 0.0.0.0:8002
   ```
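
   - Readiness can also be checked programmatically instead of reading the log. The sketch below uses Triton's generic HTTP readiness endpoint (`/v2/health/ready`) with the `requests` library; it is not a utility shipped with this example, and the host/port are assumed to match the launch command above.

   ```python
   # Probe Triton's standard HTTP readiness endpoint (generic Triton behavior,
   # not specific to this example). Assumes the server listens on localhost:8000.
   import requests

   resp = requests.get("http://localhost:8000/v2/health/ready", timeout=5)
   print("Triton ready:", resp.status_code == 200)  # 200 means the server is ready
   ```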
<li><p>Send Requests</p>
|
||
<ul class="simple">
|
||
<li><p>Prepare a JSON file <code class="docutils literal notranslate"><span class="pre">input_data.json</span></code> containing input data as below (more requests are acceptable).</p></li>
|
||
</ul>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">[</span>
|
||
<span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"input"</span><span class="p">:</span><span class="w"> </span><span class="s2">"James Best, best known for his "</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"instruction"</span><span class="p">:</span><span class="w"> </span><span class="s2">"Continue writing the following story:"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"output"</span><span class="p">:</span><span class="w"> </span><span class="s2">" "</span>
|
||
<span class="w"> </span><span class="p">}</span>
|
||
<span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
|
||
<li><p>Use command below to launch requests for inference.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--num-draft-tokens</span></code> can be modified by runtime draft lengths, 4 is used in this example.</p></li>
|
||
</ul>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python3<span class="w"> </span>tools/inflight_batcher_llm/speculative_decoding_test.py<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max-input-len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--dataset<span class="o">=</span>input_data.json<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--url-target<span class="o">=</span>localhost:8001<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--url-draft<span class="o">=</span>localhost:8001<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--draft-tensorrt-llm-model-name<span class="o">=</span><span class="s2">"</span><span class="si">${</span><span class="nv">TENSORRT_LLM_DRAFT_MODEL_NAME</span><span class="si">}</span><span class="s2">"</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--target-tensorrt-llm-model-name<span class="o">=</span><span class="s2">"</span><span class="si">${</span><span class="nv">TENSORRT_LLM_MODEL_NAME</span><span class="si">}</span><span class="s2">"</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--bls-speculative-tensorrt-llm-model-name<span class="o">=</span><span class="s2">"tensorrt_llm_bls"</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--execute-bls-speculative-decoding<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--disable-output-comparison<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--num-draft-tokens<span class="o">=</span><span class="m">4</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--verbose
|
||
</pre></div>
|
||
</div>
|
||
</li>
|
||
<li><p>Enable fast logits D2D transfer when <code class="docutils literal notranslate"><span class="pre">"use_draft_logits":</span> <span class="pre">True</span></code></p>
|
||
<ul class="simple">
|
||
<li><p>Obtaining adjusted logits distribution from draft logits is a proposed method in the <a class="reference external" href="https://arxiv.org/pdf/2211.17192.pdf">Fast Inference from Transformers via Speculative Decoding paper</a>. Fast logits feature boosts the performance (TPS) by hiding the latency of logits transfer from draft engine to target engine.</p></li>
|
||
<li><p>Fast logits feature is newly supported in TensorRT-LLM-0.15.0.</p></li>
|
||
<li><p>Modify <code class="docutils literal notranslate"><span class="pre">participant_ids</span></code> entry in <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/config.pbtxt</span></code> and <code class="docutils literal notranslate"><span class="pre">tensorrt_llm_draft/config.pbtxt</span></code> to suitable MPI ranks. Usually in this setting, rank 0 is reserved for the orchestrator rank; rank 1 is for draft engine; the rest of the ranks are for target engine. In this example, <code class="docutils literal notranslate"><span class="pre">particpant_ids</span></code> can be set as snippet below. Same logic also applies to TP>1 target engine.</p></li>
|
||
</ul>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1">### In tensorrt_llm_draft/config.pbtxt</span>
|
||
<span class="n">parameters</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">key</span><span class="p">:</span> <span class="s2">"gpu_device_ids"</span>
|
||
<span class="n">value</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">string_value</span><span class="p">:</span> <span class="s2">"0"</span>
|
||
<span class="p">}</span>
|
||
<span class="p">}</span>
|
||
<span class="n">parameters</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">key</span><span class="p">:</span> <span class="s2">"participant_ids"</span>
|
||
<span class="n">value</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">string_value</span><span class="p">:</span> <span class="s2">"1"</span>
|
||
<span class="p">}</span>
|
||
<span class="p">}</span>
|
||
<span class="c1">### In tensorrt_llm/config.pbtxt</span>
|
||
<span class="n">parameters</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">key</span><span class="p">:</span> <span class="s2">"gpu_device_ids"</span>
|
||
<span class="n">value</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">string_value</span><span class="p">:</span> <span class="s2">"1"</span>
|
||
<span class="p">}</span>
|
||
<span class="p">}</span>
|
||
<span class="n">parameters</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">key</span><span class="p">:</span> <span class="s2">"participant_ids"</span>
|
||
<span class="n">value</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">string_value</span><span class="p">:</span> <span class="s2">"2"</span>
|
||
<span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
|
||
<li><p>Enable <code class="docutils literal notranslate"><span class="pre">speculative_decoding_fast_logits</span></code> in both <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/config.pbtxt</span></code> and <code class="docutils literal notranslate"><span class="pre">tensorrt_llm_draft/config.pbtxt</span></code>.</p></li>
|
||
</ul>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">parameters</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">key</span><span class="p">:</span> <span class="s2">"speculative_decoding_fast_logits"</span>
|
||
<span class="n">value</span><span class="p">:</span> <span class="p">{</span>
|
||
<span class="n">string_value</span><span class="p">:</span> <span class="s2">"1"</span>
|
||
<span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
|
||
<li><p>Fast logits feature requires Tritonserver to be launched in orchestrator mode with <code class="docutils literal notranslate"><span class="pre">--disable-spawn-process</span></code>. See <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md">model config</a> for more information. <code class="docutils literal notranslate"><span class="pre">--world_size</span></code> has to be set as 1 (orchestrator rank 0) + 1 (draft engine ranks) + 1 (target engine ranks).</p></li>
|
||
</ul>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python3<span class="w"> </span>scripts/launch_triton_server.py<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--model_repo<span class="o">=</span><span class="nv">$TRITON_REPO</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--tensorrt_llm_model_name<span class="w"> </span><span class="s2">"tensorrt_llm,tensorrt_llm_draft"</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--multi-model<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--disable-spawn-processes<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--world_size<span class="o">=</span><span class="m">3</span><span class="w"> </span>--log<span class="w"> </span><span class="p">&</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
<li><p>Send a request with <code class="docutils literal notranslate"><span class="pre">use_draft_logits</span></code> enabled to the Tritonserver BLS API:</p></li>
</ul>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">curl</span> <span class="o">-</span><span class="n">X</span> <span class="n">POST</span> <span class="s2">"http://localhost:8000/v2/models/tensorrt_llm_bls/generate"</span> \
|
||
<span class="o">-</span><span class="n">H</span> <span class="s2">"Content-Type: application/json"</span> \
|
||
<span class="o">-</span><span class="n">d</span> <span class="s1">'{</span>
|
||
<span class="s2">"text_input"</span><span class="p">:</span> <span class="s2">"Continue writing the following story: James Best, best known for his"</span><span class="p">,</span>
|
||
<span class="s2">"max_tokens"</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span>
|
||
<span class="s2">"num_draft_tokens"</span><span class="p">:</span> <span class="mi">10</span><span class="p">,</span>
|
||
<span class="s2">"use_draft_logits"</span><span class="p">:</span> <span class="n">true</span><span class="p">,</span>
|
||
<span class="s2">"stream"</span><span class="p">:</span> <span class="n">false</span>
|
||
<span class="p">}</span><span class="s1">'</span>
|
||
</pre></div>
|
||
</div>
|
||
<ul class="simple">
<li><p>With fast logits enabled and the optimization tips in the <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md#some-tips-for-model-configuration">model configuration</a> guide applied, speculative decoding with draft logits achieves a 2.x throughput speedup at batch size 1 and a 1.x speedup at batch size 16 compared to auto-regressive decoding, using a Llama 3.2 1B draft model and a Llama 3.1 70B target model.</p></li>
</ul>
</li>
<li><p>Kill Tritonserver after finishing inference</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pkill<span class="w"> </span>-9<span class="w"> </span>-f<span class="w"> </span>trtllmExecutorWorker
pkill<span class="w"> </span>-9<span class="w"> </span>-f<span class="w"> </span>tritonserver
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="prompt-lookup-decoding">
|
||
<h2>Prompt-Lookup-Decoding<a class="headerlink" href="#prompt-lookup-decoding" title="Link to this heading"></a></h2>
|
||
<p>See document in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/README.md">examples/prompt_lookup/README.md</a> and the code can be found in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py">examples/prompt_lookup/run_dtm_pld.py</a>.</p>
|
||
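<p>For intuition, the following standalone Python sketch illustrates the core idea of prompt-lookup decoding; it is an illustration only, not the TensorRT-LLM implementation, and the function name <code class="docutils literal notranslate"><span class="pre">propose_draft_tokens</span></code> is hypothetical. Draft tokens are proposed by finding an earlier occurrence of the most recent n-gram in the sequence and copying the tokens that followed it; the target model then verifies those drafts.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: propose draft tokens by matching the most recent n-gram
# against earlier positions in the prompt/generated sequence.
def propose_draft_tokens(tokens, ngram_size=2, num_draft_tokens=10):
    if len(tokens) < ngram_size:
        return []
    pattern = tokens[-ngram_size:]
    # Search backwards, skipping the trivial match at the very end of the sequence.
    for start in range(len(tokens) - ngram_size - 1, -1, -1):
        if tokens[start:start + ngram_size] == pattern:
            follow = tokens[start + ngram_size:start + ngram_size + num_draft_tokens]
            if follow:
                return follow  # candidate draft tokens, to be verified by the target model
    return []

# The bigram "the cat" re-appears, so the tokens after its earlier occurrence
# are proposed as drafts: ['sat', 'on', 'the', 'mat', 'and', 'the', 'cat']
print(propose_draft_tokens("the cat sat on the mat and the cat".split()))
</pre></div>
</div>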
</section>
<section id="medusa">
<h2>Medusa<a class="headerlink" href="#medusa" title="Link to this heading"></a></h2>
<p>This approach leverages a single model to both generate and verify draft tokens.
It enhances the existing model by adding multiple extra language model heads, known as Medusa heads.
These additional heads are trained to predict future tokens while the base model remains unchanged.
Specifically, the first Medusa head is tasked with predicting the immediate next token,
the second head predicts the token after that, and so on.
With <code class="docutils literal notranslate"><span class="pre">K</span></code> Medusa heads, the model can forecast up to <code class="docutils literal notranslate"><span class="pre">K</span></code> tokens ahead.
The draft tokens generated by the Medusa heads during iteration <code class="docutils literal notranslate"><span class="pre">i</span></code>
are then verified and potentially accepted in the subsequent iteration, <code class="docutils literal notranslate"><span class="pre">i+1</span></code>.</p>
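<p>As a rough illustration of that verification step (plain Python, not TensorRT-LLM code), the sketch below applies greedy acceptance along a single drafted path: drafts proposed at iteration <code class="docutils literal notranslate"><span class="pre">i</span></code> are kept only up to the first mismatch with the tokens the base LM head selects at iteration <code class="docutils literal notranslate"><span class="pre">i+1</span></code>.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: greedy acceptance of drafted tokens along one path.
def greedy_accept(draft_tokens, target_tokens):
    """Keep drafts until the first mismatch with the tokens the base LM head
    selects when re-scoring the same path (greedy validation)."""
    accepted = []
    for draft, target in zip(draft_tokens, target_tokens):
        if draft != target:
            break
        accepted.append(draft)
    return accepted

# Two of four drafts survive, so this step advances by three tokens in total:
# the two accepted drafts plus the base model's own next token.
print(greedy_accept([42, 7, 99, 3], [42, 7, 11, 3]))   # [42, 7]
</pre></div>
</div>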
<p>The true potential of the Medusa strategy is realized when more than one token per head is used,
employing a TopK approach to create multiple potential paths, essentially forming a tree, rather than
a single linear path as seen in the Draft model approach. To reduce redundant computations, many of these paths,
which often share common prefixes, are consolidated into a single path.
This is achieved by applying attention with a sparse mask that represents the various paths. The sparse mask formed by the Medusa tree is described in more detail in the Medusa Tree section below.</p>
<p>By validating multiple paths simultaneously, there is an increased likelihood of accepting more than one token per iteration,
albeit at the expense of additional computational effort.</p>
<p>It is crucial to recognize that as the number of potential paths grows exponentially with <code class="docutils literal notranslate"><span class="pre">K</span></code>,
it is not necessary to explore or validate all of them. A recommended strategy for managing this complexity is to prune the tree
by focusing only on the paths with higher-probability tokens.</p>
<p>You must strike a balance between the breadth and depth of the tree you want to explore and the impact of a larger tree on the overall
performance for your specific application.</p>
<p>In the TensorRT-LLM implementation of Medusa, the configuration of the tree is a runtime parameter.
This flexibility allows you to experiment and identify the optimal tree structure for your use case,
which can then be utilized in a production environment.</p>
<section id="medusa-tree">
|
||
<h3>Medusa Tree<a class="headerlink" href="#medusa-tree" title="Link to this heading"></a></h3>
|
||
<p>Consider the following diagram, which illustrates how the hidden states from the last layer of the base model
|
||
are passed to the base model’s language model (LM) head and to four Medusa heads (MHs).</p>
|
||
<p align="center">
|
||
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/media/medusa_tree.svg?raw=true" alt="Example Medusa Tree" width="auto" height="auto">
|
||
</p>
|
||
<p>In this example:</p>
|
||
<ol class="arabic simple">
<li><p>The token <code>l<sub>0</sub></code> represents the actual token generated by the model.
All other tokens, denoted as <code>p<sub>hk</sub></code>, are predictions from the MHs,
where <code class="docutils literal notranslate"><span class="pre">h</span></code> indicates the Medusa head index (1-based) and <code class="docutils literal notranslate"><span class="pre">k</span></code> represents the TopK choice index (0-based).</p></li>
<li><p>Four MHs are used, which means the model is predicting four future tokens.</p></li>
<li><p>The first two MHs utilize Top-2 predictions, while the last two use Top-1.
For instance, <code>p<sub>10</sub></code> and <code>p<sub>11</sub></code> are the top and
second top predictions from the first Medusa Head (MH1).</p></li>
<li><p>A total of four paths are explored, which is fewer than the 16 that would be examined
if a complete binary tree were used (assuming Top-2 predictions for all MHs).</p></li>
<li><p>Because each path may be only partially accepted, there are ten potential candidates (including the one where only the true token is accepted), referred to as <code class="docutils literal notranslate"><span class="pre">medusa_choices</span></code>.
The number of tokens that can be accepted at each step, including the true token,
ranges from 1 (if all Medusa predictions are incorrect) to 5 (if all are correct).</p></li>
</ol>
<p>During the generation phase, the model receives an input of 10 tokens,
which corresponds to the last tokens of each candidate path, rather than just one.</p>
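<p>The sparse attention mask mentioned earlier can be derived directly from these candidates. The following plain-Python sketch (an illustration, not the TensorRT-LLM kernels) marks position <code class="docutils literal notranslate"><span class="pre">i</span></code> as allowed to attend to position <code class="docutils literal notranslate"><span class="pre">j</span></code> exactly when node <code class="docutils literal notranslate"><span class="pre">j</span></code> lies on node <code class="docutils literal notranslate"><span class="pre">i</span></code>'s path back to the root, the root being the true token <code>l<sub>0</sub></code>. With the candidate prefixes of the example tree (enumerated explicitly in the list below) plus the root, this yields the 10×10 mask that accompanies the 10 input tokens.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: build the tree attention mask from the candidate paths.
def build_medusa_mask(choices):
    """Row i may attend to column j iff node j is an ancestor of node i
    (or i itself); the empty tuple () stands for the true token (root)."""
    nodes = [()] + sorted({tuple(c) for c in choices}, key=lambda c: (len(c), c))
    mask = [[node[:len(other)] == other for other in nodes] for node in nodes]
    return nodes, mask

choices = [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0], [0, 1], [0, 1, 0],
           [1], [1, 0], [1, 1]]
nodes, mask = build_medusa_mask(choices)
print(len(nodes))                      # 10 tree nodes -> 10 input tokens per step
print(mask[nodes.index((0, 1, 0))])
# True only for (), (0,), (0, 1) and (0, 1, 0): its ancestors and itself.
</pre></div>
</div>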
<p>In TensorRT-LLM, you have the option to define such trees by providing all the Medusa choices
or by simply specifying the unique paths.</p>
<ul class="simple">
<li><p>Since each candidate/path begins with the true token (<code>l<sub>0</sub></code>),
there is no need to specify it separately. For the predicted tokens, only the TopK indices are required.</p></li>
<li><p>For example, to specify the path <code>l<sub>0</sub>p<sub>10</sub>p<sub>21</sub>p<sub>30</sub></code>,
one would use <code class="docutils literal notranslate"><span class="pre">[0,1,0]</span></code>; to specify the path <code>l<sub>0</sub>p<sub>11</sub>p<sub>20</sub></code>,
one would use <code class="docutils literal notranslate"><span class="pre">[1,0]</span></code>.</p></li>
<li><p>To specify all 4 paths in the example, use <code class="docutils literal notranslate"><span class="pre">medusa_choices=[[0,0,0,0],</span> <span class="pre">[0,1,0],</span> <span class="pre">[1,0],</span> <span class="pre">[1,1]]</span></code>.</p></li>
<li><p>It’s also possible to specify all candidates explicitly, similar to the Medusa repository.
For instance, <code class="docutils literal notranslate"><span class="pre">medusa_choices=[[0],</span> <span class="pre">[0,0],</span> <span class="pre">[0,0,0],</span> <span class="pre">[0,0,0,0],</span> <span class="pre">[0,1],</span> <span class="pre">[0,1,0],</span> <span class="pre">[1],</span> <span class="pre">[1,0],</span> <span class="pre">[1,1]]</span></code>. Note that when specifying all the candidates explicitly, <strong>we don’t include
the empty <code class="docutils literal notranslate"><span class="pre">[]</span></code> candidate</strong> for the case where only the true token is accepted, that is, when all the predictions from the MHs are wrong.
So, only <code class="docutils literal notranslate"><span class="pre">9</span></code> candidates are specified (see the sketch after this list for how the paths-only form expands into these candidates).</p></li>
</ul>
<p><strong>Specifying paths-only instead of all choices is currently supported only in the Python runtime.</strong></p>
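<p>The relationship between the two forms can be made concrete with a short, standalone Python sketch (an illustration, not a TensorRT-LLM API): expanding every non-empty prefix of the unique paths reproduces the nine explicit candidates listed above.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: expand a paths-only specification into explicit candidates.
def expand_paths_to_choices(paths):
    """Every non-empty prefix of every path is a candidate."""
    choices = {tuple(path[:depth]) for path in paths for depth in range(1, len(path) + 1)}
    return sorted(choices, key=lambda c: (len(c), c))

paths = [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]]
print(expand_paths_to_choices(paths))
# [(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1), (0, 0, 0), (0, 1, 0), (0, 0, 0, 0)]
# -> the same 9 candidates as the explicit form above.
</pre></div>
</div>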
</section>
<section id="using-medusa-with-tensorrt-llm">
|
||
<h3>Using Medusa with TensorRT-LLM<a class="headerlink" href="#using-medusa-with-tensorrt-llm" title="Link to this heading"></a></h3>
|
||
<p>For guidance on constructing and executing Medusa with the Python runtime, consult the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa/README.md">Medusa README</a>. When utilizing the Inflight Fused Batching (IFB) with the C++ API, it is necessary to define the <code class="docutils literal notranslate"><span class="pre">medusa_choices</span></code> explicitly within the model configuration. For detailed instructions, refer to the <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration">model configuration in TensorRT-LLM backend</a> for more details.</p>
|
||
<section id="limitations">
|
||
<h4>Limitations<a class="headerlink" href="#limitations" title="Link to this heading"></a></h4>
|
||
<ul class="simple">
|
||
<li><p>TensorRT-LLM supports Medusa only for Vicuna (fine tuned LLaMA).
|
||
However, similar to any new model, you can follow the same approach to define your own Medusa model and deploy with TensorRT-LLM.</p></li>
|
||
<li><p>We match only tokens during the validation phase that is <code class="docutils literal notranslate"><span class="pre">medusa_temperature=0</span></code>.</p></li>
|
||
<li><p>Beam search is <strong>not</strong> compatible with Medusa.</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
</section>
|
||
<section id="redrafter">
|
||
<h2>ReDrafter<a class="headerlink" href="#redrafter" title="Link to this heading"></a></h2>
|
||
<p>The ReDrafter approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. However, unlike Medusa, it predicts draft tokens using a recurrent predictor, where each draft token depends on the previous one. This method also allows the use of beam search to identify more prominent draft tokens. For more details, please read <a class="reference external" href="https://arxiv.org/html/2403.09919v1">the ReDrafter paper</a>.</p>
|
||
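<p>As a rough, conceptual sketch of such a recurrent drafter (PyTorch pseudocode for illustration only; the class name and shapes are hypothetical, this is not the TensorRT-LLM or ReDrafter reference implementation, and the greedy loop below stands in for the beam search that ReDrafter actually uses):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: a recurrent drafter conditioned on the base model's
# last hidden state, where each draft token depends on the previous one.
import torch
import torch.nn as nn

class RecurrentDrafter(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRUCell(hidden_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, base_hidden, last_token, num_draft_tokens):
        # base_hidden: [batch, hidden_size] from the target model's last layer
        # last_token:  [batch] most recently accepted token
        h, tok, drafts = base_hidden, last_token, []
        for _ in range(num_draft_tokens):
            h = self.rnn(self.embed(tok), h)      # recurrence over draft steps
            tok = self.lm_head(h).argmax(dim=-1)  # greedy here; ReDrafter uses beam search
            drafts.append(tok)
        return torch.stack(drafts, dim=-1)        # [batch, num_draft_tokens]
</pre></div>
</div>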
<p>TensorRT-LLM implements the ReDrafter model such that logits prediction, beam search, and draft token acceptance are performed inside the TensorRT engine. This contrasts with standard model inference, which only predicts logits and performs decoding outside the engine. Since the engine predicts explicit draft tokens instead of implicit tokens decoded from logits, we categorize this speculative decoding method as <code class="docutils literal notranslate"><span class="pre">explicit_draft_tokens</span></code>. Please visit the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/redrafter/README.md">ReDrafter README</a> for information about building and running the model. ReDrafter supports both the Inflight Fused Batching runtime and the Python static batching runtime.</p>
</section>
<section id="eagle">
|
||
<h2>EAGLE<a class="headerlink" href="#eagle" title="Link to this heading"></a></h2>
|
||
<p>The EAGLE approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. Similarly to ReDrafter, it predicts draft tokens using a recurrent predictor where each draft token depends on the previous one. However, unlike ReDrafter, it uses a single-layer transformer model to predict draft tokens from previous hidden states and decoded tokens. In the EAGLE-1 decoding tree needs to be known during the decoding. In the EAGLE-2 this tree is asssembled during the execution by searching for the most probable hypothesis along the beam.</p>
|
||
<p>Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft tokens acceptance and draft token generation are performed inside of the TensorRT engine. Only EAGLE-1 is supported. Please, visit the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md">EAGLE README</a> for information about building and running the model.</p>
|
||
</section>
|
||
<section id="lookahead-decoding">
|
||
<h2>Lookahead Decoding<a class="headerlink" href="#lookahead-decoding" title="Link to this heading"></a></h2>
|
||
<p>Lookahead decoding algorithm operates through two parallel computation branches within the same model: a lookahead branch that generates n-grams using a fixed-sized 2D window, and a verification branch that validates promising n-gram candidates. This approach eliminates the necessity for additional model training or fine-tuning and can be enabled for any autoregressive model. Refer to the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/lookahead/README.md">Lookahead decoding README</a> for information about building and running the model.</p>
|
||
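<p>To make the verification branch concrete, the following plain-Python sketch (an illustration, not the TensorRT-LLM implementation; the class and method names are hypothetical) shows the n-gram pool bookkeeping: n-grams produced by the lookahead branch are stored keyed by their first token, and continuations whose key matches the last generated token are proposed as draft tokens for verification.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustration only: n-gram pool used to feed the verification branch.
from collections import defaultdict

class NgramPool:
    def __init__(self, ngram_size):
        self.n = ngram_size
        self.pool = defaultdict(set)   # first token -> set of (n-1)-token continuations

    def update(self, tokens):
        """Collect n-grams generated by the lookahead branch."""
        for i in range(len(tokens) - self.n + 1):
            self.pool[tokens[i]].add(tuple(tokens[i + 1:i + self.n]))

    def candidates(self, last_token):
        """Continuations to verify against the model's actual outputs."""
        return sorted(self.pool.get(last_token, ()))

pool = NgramPool(ngram_size=3)
pool.update([5, 8, 2, 5, 8, 9])
print(pool.candidates(5))   # [(8, 2), (8, 9)] -- drafts to verify after token 5
</pre></div>
</div>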
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="kv-cache-reuse.html" class="btn btn-neutral float-left" title="KV cache reuse" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="disaggregated-service.html" class="btn btn-neutral float-right" title="Disaggregated-Service (experimental)" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<div class="footer">
|
||
<p>
|
||
Copyright © 2024 NVIDIA Corporation
|
||
</p>
|
||
<p>
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Privacy Policy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Manage My Privacy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Accessibility</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Product Security</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Contact</a>
|
||
</p>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</footer>
|
||
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>