<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Release Notes &mdash; tensorrt_llm documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
<link rel="stylesheet" type="text/css" href="_static/copybutton.css?v=76b2166b" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/clipboard.min.js?v=a7894cd8"></script>
<script src="_static/copybutton.js?v=f281be69"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Installing on Linux" href="installation/linux.html" />
<link rel="prev" title="Key Features" href="key-features.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
tensorrt_llm
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="key-features.html">Key Features</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Release Notes</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-14-0">TensorRT-LLM Release 0.14.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#key-features-and-enhancements">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#api-changes">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#model-updates">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#fixed-issues">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#infrastructure-changes">Infrastructure Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#documentation">Documentation</a></li>
<li class="toctree-l3"><a class="reference internal" href="#known-issues">Known Issues</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-13-0">TensorRT-LLM Release 0.13.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id2">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id3">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id4">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id5">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id6">Infrastructure Changes</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-12-0">TensorRT-LLM Release 0.12.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id7">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id8">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id9">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id10">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id11">Infrastructure Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id12">Known Issues</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-11-0">TensorRT-LLM Release 0.11.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id13">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id14">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id15">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id16">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id17">Infrastructure Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id18">Known Issues</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-10-0">TensorRT-LLM Release 0.10.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#announcements">Announcements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id19">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id20">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id21">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id22">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id23">Infrastructure changes</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-9-0">TensorRT-LLM Release 0.9.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id24">Announcements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id25">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id26">API Changes</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id27">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#limitations">Limitations</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id28">Fixed Issues</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-8-0">TensorRT-LLM Release 0.8.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id29">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id30">Model Updates</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt-llm-release-0-7-1">TensorRT-LLM Release 0.7.1</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id31">Key Features and Enhancements</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id32">Model Updates</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id33">Fixed Issues</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id34">Known Issues</a></li>
</ul>
</li>
</ul>
</li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation/windows.html">Installing on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="llm-api/index.html">API Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm-api/reference.html">API Reference</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="llm-api-examples/index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm-api-examples/customization.html">Common Customizations</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm-api-examples/llm_api_examples.html">Examples</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="commands/trtllm-build.html">trtllm-build</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/core-concepts.html#compilation">Compilation</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/core-concepts.html#runtime">Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="architecture/add-model.html">Adding a Model</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/batch-manager.html">The Batch Manager in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/inference-request.html">Inference Request</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/inference-request.html#responses">Responses</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html#lookahead-decoding">Lookahead decoding</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="performance/perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="performance/perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1"><a class="reference internal" href="performance/perf-best-practices.html">Best Practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="performance/perf-analysis.html">Performance Analysis</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">tensorrt_llm</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">Release Notes</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/release-notes.md.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="release-notes">
<span id="id1"></span><h1>Release Notes<a class="headerlink" href="#release-notes" title="Link to this heading"></a></h1>
<p>All published functionality in the Release Notes has been fully tested and verified, with known limitations documented. To share feedback about this release, visit our <a class="reference external" href="https://forums.developer.nvidia.com/">NVIDIA Developer Forum</a>.</p>
<section id="tensorrt-llm-release-0-14-0">
<h2>TensorRT-LLM Release 0.14.0<a class="headerlink" href="#tensorrt-llm-release-0-14-0" title="Link to this heading"></a></h2>
<section id="key-features-and-enhancements">
<h3>Key Features and Enhancements<a class="headerlink" href="#key-features-and-enhancements" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Enhanced the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class in the <a class="reference external" href="https://nvidia.github.io/TensorRT-LLM/llm-api/index.html">LLM API</a>.</p>
<ul>
<li><p>Added support for calibration with offline dataset.</p></li>
<li><p>Added support for Mamba2.</p></li>
<li><p>Added support for <code class="docutils literal notranslate"><span class="pre">finish_reason</span></code> and <code class="docutils literal notranslate"><span class="pre">stop_reason</span></code> (see the sketch after this list).</p></li>
</ul>
</li>
<li><p>Added FP8 support for CodeLlama.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">__repr__</span></code> methods for class <code class="docutils literal notranslate"><span class="pre">Module</span></code>, thanks to the contribution from &#64;1ytic in #2191.</p></li>
<li><p>Added BFloat16 support for fused gated MLP.</p></li>
<li><p>Updated ReDrafter beam search logic to match Apple ReDrafter v1.1.</p></li>
<li><p>Improved <code class="docutils literal notranslate"><span class="pre">customAllReduce</span></code> performance.</p></li>
<li><p>The draft model can now copy logits directly over MPI to the target model's process in <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> mode. This fast logits copy reduces the delay between draft token generation and the beginning of target model inference.</p></li>
<li><p>NVIDIA Volta GPU support is deprecated and will be removed in a future release.</p></li>
</ul>
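<p>The following is a minimal, illustrative sketch of reading the new <code class="docutils literal notranslate"><span class="pre">finish_reason</span></code> and <code class="docutils literal notranslate"><span class="pre">stop_reason</span></code> fields through the LLM API; the model path, prompt, and stop string below are placeholders, not part of this release.</p>
<div class="highlight"><pre>
from tensorrt_llm import LLM, SamplingParams

# Placeholder checkpoint; any supported Hugging Face model or TensorRT-LLM engine works.
llm = LLM(model="meta-llama/Llama-2-7b-hf")

# Generation ends either at the token budget or at an explicit stop string.
sampling_params = SamplingParams(max_tokens=64, stop=["\n\n"])

for output in llm.generate(["What is TensorRT-LLM?"], sampling_params):
    completion = output.outputs[0]
    # finish_reason reports why generation ended (for example "length" or "stop");
    # stop_reason carries the matched stop string or token, if any.
    print(completion.text, completion.finish_reason, completion.stop_reason)
</pre></div>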
</section>
<section id="api-changes">
<h3>API Changes<a class="headerlink" href="#api-changes" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>[BREAKING CHANGE] The default <code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code> of the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command is set to <code class="docutils literal notranslate"><span class="pre">2048</span></code>.</p></li>
<li><p>[BREAKING CHANGE] Removed <code class="docutils literal notranslate"><span class="pre">builder_opt</span></code> from the <code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code> class and the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>Added logits post-processor support to the <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> class.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">isParticipant</span></code> method to the C++ <code class="docutils literal notranslate"><span class="pre">Executor</span></code> API to check if the current process is a participant in the executor instance.</p></li>
</ul>
</section>
<section id="model-updates">
<h3>Model Updates<a class="headerlink" href="#model-updates" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Added support for NemotronNas, see <code class="docutils literal notranslate"><span class="pre">examples/nemotron_nas/README.md</span></code>.</p></li>
<li><p>Added support for Deepseek-v1, see <code class="docutils literal notranslate"><span class="pre">examples/deepseek_v1/README.md</span></code>.</p></li>
<li><p>Added support for Phi-3.5 models, see <code class="docutils literal notranslate"><span class="pre">examples/phi/README.md</span></code>.</p></li>
</ul>
</section>
<section id="fixed-issues">
<h3>Fixed Issues<a class="headerlink" href="#fixed-issues" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed a typo in <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/models/model_weights_loader.py</span></code>, thanks to the contribution from &#64;wangkuiyi in #2152.</p></li>
<li><p>Fixed duplicated import module in <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/runtime/generation.py</span></code>, thanks to the contribution from &#64;lkm2835 in #2182.</p></li>
<li><p>Enabled <code class="docutils literal notranslate"><span class="pre">share_embedding</span></code> for models that have no <code class="docutils literal notranslate"><span class="pre">lm_head</span></code> in the legacy checkpoint conversion path, thanks to the contribution from &#64;lkm2835 in #2232.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">kv_cache_type</span></code> issue in the Python benchmark, thanks to the contribution from &#64;qingquansong in #2219.</p></li>
<li><p>Fixed an issue with SmoothQuant calibration with custom datasets. Thanks to the contribution by &#64;Bhuvanesh09 in #2243.</p></li>
<li><p>Fixed an issue surrounding <code class="docutils literal notranslate"><span class="pre">trtllm-build</span> <span class="pre">--fast-build</span></code> with fake or random weights. Thanks to &#64;ZJLi2013 for flagging it in #2135.</p></li>
<li><p>Fixed missing <code class="docutils literal notranslate"><span class="pre">use_fused_mlp</span></code> when constructing <code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code> from dict, thanks for the fix from &#64;ethnzhng in #2081.</p></li>
<li><p>Fixed lookahead batch layout for <code class="docutils literal notranslate"><span class="pre">numNewTokensCumSum</span></code>. (#2263)</p></li>
</ul>
</section>
<section id="infrastructure-changes">
<h3>Infrastructure Changes<a class="headerlink" href="#infrastructure-changes" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>The dependent ModelOpt version is updated to v0.17.</p></li>
</ul>
</section>
<section id="documentation">
<h3>Documentation<a class="headerlink" href="#documentation" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>&#64;Sherlock113 added a <a class="reference external" href="https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml">tech blog</a> to the latest news section in #2169, thanks for the contribution.</p></li>
</ul>
</section>
<section id="known-issues">
<h3>Known Issues<a class="headerlink" href="#known-issues" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Replit Code is not supported with transformers 4.45 and later.</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-13-0">
<h2>TensorRT-LLM Release 0.13.0<a class="headerlink" href="#tensorrt-llm-release-0-13-0" title="Link to this heading"></a></h2>
<section id="id2">
<h3>Key Features and Enhancements<a class="headerlink" href="#id2" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported lookahead decoding (experimental), see <code class="docutils literal notranslate"><span class="pre">docs/source/speculative_decoding.md</span></code>.</p></li>
<li><p>Added some enhancements to the <code class="docutils literal notranslate"><span class="pre">ModelWeightsLoader</span></code> (a unified checkpoint converter, see <code class="docutils literal notranslate"><span class="pre">docs/source/architecture/model-weights-loader.md</span></code>).</p>
<ul>
<li><p>Supported Qwen models.</p></li>
<li><p>Supported auto-padding for indivisible TP shape in INT4-wo/INT8-wo/INT4-GPTQ.</p></li>
<li><p>Improved loading performance for <code class="docutils literal notranslate"><span class="pre">*.bin</span></code> and <code class="docutils literal notranslate"><span class="pre">*.pth</span></code> files.</p></li>
</ul>
</li>
<li><p>Supported OpenAI Whisper in C++ runtime.</p></li>
<li><p>Added some enhancements to the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class.</p>
<ul>
<li><p>Supported LoRA.</p></li>
<li><p>Supported engine building using dummy weights.</p></li>
<li><p>Supported <code class="docutils literal notranslate"><span class="pre">trust_remote_code</span></code> for customized models and tokenizers downloaded from the Hugging Face Hub (see the sketch after this list).</p></li>
</ul>
</li>
<li><p>Supported beam search for streaming mode.</p></li>
<li><p>Supported tensor parallelism for Mamba2.</p></li>
<li><p>Supported returning generation logits for streaming mode.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">curand</span></code> and <code class="docutils literal notranslate"><span class="pre">bfloat16</span></code> support for <code class="docutils literal notranslate"><span class="pre">ReDrafter</span></code>.</p></li>
<li><p>Added sparse mixer normalization mode for MoE models.</p></li>
<li><p>Added support for QKV scaling in FP8 FMHA.</p></li>
<li><p>Supported FP8 for MoE LoRA.</p></li>
<li><p>Supported KV cache reuse for P-Tuning and LoRA.</p></li>
<li><p>Supported in-flight batching for CogVLM models.</p></li>
<li><p>Supported LoRA for the <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> class.</p></li>
<li><p>Supported <code class="docutils literal notranslate"><span class="pre">head_size=48</span></code> cases for FMHA kernels.</p></li>
<li><p>Added FP8 examples for DiT models, see <code class="docutils literal notranslate"><span class="pre">examples/dit/README.md</span></code>.</p></li>
<li><p>Supported decoder with encoder input features for the C++ <code class="docutils literal notranslate"><span class="pre">executor</span></code> API.</p></li>
</ul>
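<p>As a rough sketch of the <code class="docutils literal notranslate"><span class="pre">trust_remote_code</span></code> enhancement to the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class; the model name below is hypothetical.</p>
<div class="highlight"><pre>
from tensorrt_llm import LLM

# Hypothetical customized checkpoint on the Hugging Face Hub; trust_remote_code
# allows its bundled modeling and tokenizer code to run during conversion.
llm = LLM(model="my-org/my-custom-model", trust_remote_code=True)

outputs = llm.generate(["Hello, my name is"])
print(outputs[0].outputs[0].text)
</pre></div>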
</section>
<section id="id3">
<h3>API Changes<a class="headerlink" href="#id3" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>[BREAKING CHANGE] Set <code class="docutils literal notranslate"><span class="pre">use_fused_mlp</span></code> to <code class="docutils literal notranslate"><span class="pre">True</span></code> by default.</p></li>
<li><p>[BREAKING CHANGE] Enabled <code class="docutils literal notranslate"><span class="pre">multi_block_mode</span></code> by default.</p></li>
<li><p>[BREAKING CHANGE] Enabled <code class="docutils literal notranslate"><span class="pre">strongly_typed</span></code> by default in <code class="docutils literal notranslate"><span class="pre">builder</span></code> API.</p></li>
<li><p>[BREAKING CHANGE] Renamed <code class="docutils literal notranslate"><span class="pre">maxNewTokens</span></code>, <code class="docutils literal notranslate"><span class="pre">randomSeed</span></code> and <code class="docutils literal notranslate"><span class="pre">minLength</span></code> to <code class="docutils literal notranslate"><span class="pre">maxTokens</span></code>, <code class="docutils literal notranslate"><span class="pre">seed</span></code> and <code class="docutils literal notranslate"><span class="pre">minTokens</span></code> following OpenAI style (see the sketch after this list).</p></li>
<li><p>The <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class</p>
<ul>
<li><p>[BREAKING CHANGE] Updated <code class="docutils literal notranslate"><span class="pre">LLM.generate</span></code> arguments to include <code class="docutils literal notranslate"><span class="pre">PromptInputs</span></code> and <code class="docutils literal notranslate"><span class="pre">tqdm</span></code>.</p></li>
</ul>
</li>
<li><p>The C++ <code class="docutils literal notranslate"><span class="pre">executor</span></code> API</p>
<ul>
<li><p>[BREAKING CHANGE] Added <code class="docutils literal notranslate"><span class="pre">LogitsPostProcessorConfig</span></code>.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">FinishReason</span></code> to <code class="docutils literal notranslate"><span class="pre">Result</span></code>.</p></li>
</ul>
</li>
</ul>
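<p>A minimal sketch of the renamed sampling options as they surface in the Python <code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code> class, assuming it mirrors the OpenAI-style names; the model path is a placeholder.</p>
<div class="highlight"><pre>
from tensorrt_llm import LLM, SamplingParams

# maxNewTokens -> maxTokens, randomSeed -> seed, minLength -> minTokens;
# the snake_case equivalents below are assumed for the Python API.
params = SamplingParams(max_tokens=32, seed=42)

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder checkpoint
print(llm.generate(["Hello"], params)[0].outputs[0].text)
</pre></div>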
</section>
<section id="id4">
<h3>Model Updates<a class="headerlink" href="#id4" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported Gemma 2, see “Run Gemma 2” section in <code class="docutils literal notranslate"><span class="pre">examples/gemma/README.md</span></code>.</p></li>
</ul>
</section>
<section id="id5">
<h3>Fixed Issues<a class="headerlink" href="#id5" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed an accuracy issue when enabling padding removal for cross attention. (#1999)</p></li>
<li><p>Fixed the failure in converting qwen2-0.5b-instruct when using <code class="docutils literal notranslate"><span class="pre">smoothquant</span></code>. (#2087)</p></li>
<li><p>Matched the <code class="docutils literal notranslate"><span class="pre">exclude_modules</span></code> pattern in <code class="docutils literal notranslate"><span class="pre">convert_utils.py</span></code> to the changes in <code class="docutils literal notranslate"><span class="pre">quantize.py</span></code>. (#2113)</p></li>
<li><p>Fixed build engine error when <code class="docutils literal notranslate"><span class="pre">FORCE_NCCL_ALL_REDUCE_STRATEGY</span></code> is set.</p></li>
<li><p>Fixed unexpected truncation in the quant mode of <code class="docutils literal notranslate"><span class="pre">gpt_attention</span></code>.</p></li>
<li><p>Fixed the hang caused by race condition when canceling requests.</p></li>
<li><p>Fixed the default factory for <code class="docutils literal notranslate"><span class="pre">LoraConfig</span></code>. (#1323)</p></li>
</ul>
</section>
<section id="id6">
<h3>Infrastructure Changes<a class="headerlink" href="#id6" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Base Docker image for TensorRT-LLM is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/pytorch:24.07-py3</span></code>.</p></li>
<li><p>Base Docker image for TensorRT-LLM Backend is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.07-py3</span></code>.</p></li>
<li><p>The dependent TensorRT version is updated to 10.4.0.</p></li>
<li><p>The dependent CUDA version is updated to 12.5.1.</p></li>
<li><p>The dependent PyTorch version is updated to 2.4.0.</p></li>
<li><p>The dependent ModelOpt version is updated to v0.15.</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-12-0">
<h2>TensorRT-LLM Release 0.12.0<a class="headerlink" href="#tensorrt-llm-release-0-12-0" title="Link to this heading"></a></h2>
<section id="id7">
<h3>Key Features and Enhancements<a class="headerlink" href="#id7" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported LoRA for MoE models.</p></li>
<li><p>The <code class="docutils literal notranslate"><span class="pre">ModelWeightsLoader</span></code> is enabled for LLaMA family models (experimental), see <code class="docutils literal notranslate"><span class="pre">docs/source/architecture/model-weights-loader.md</span></code>.</p></li>
<li><p>Supported FP8 FMHA for NVIDIA Ada Lovelace Architecture.</p></li>
<li><p>Supported GPT-J, Phi, Phi-3, Qwen, GPT, GLM, Baichuan, Falcon and Gemma models for the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class.</p></li>
<li><p>Supported FP8 OOTB MoE.</p></li>
<li><p>Supported Starcoder2 SmoothQuant. (#1886)</p></li>
<li><p>Supported ReDrafter Speculative Decoding, see “ReDrafter” section in <code class="docutils literal notranslate"><span class="pre">docs/source/speculative_decoding.md</span></code>.</p></li>
<li><p>Supported padding removal for BERT, thanks to the contribution from &#64;Altair-Alpha in #1834.</p></li>
<li><p>Added in-flight batching support for GLM 10B model.</p></li>
<li><p>Supported <code class="docutils literal notranslate"><span class="pre">gelu_pytorch_tanh</span></code> activation function, thanks to the contribution from &#64;ttim in #1897.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">chunk_length</span></code> parameter to Whisper, thanks to the contribution from &#64;MahmoudAshraf97 in #1909.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">concurrency</span></code> argument for <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code>.</p></li>
<li><p>Executor API supports requests with different beam widths, see <code class="docutils literal notranslate"><span class="pre">docs/source/executor.md#sending-requests-with-different-beam-widths</span></code>.</p></li>
<li><p>Added the flag <code class="docutils literal notranslate"><span class="pre">--fast_build</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command (experimental).</p></li>
</ul>
</section>
<section id="id8">
<h3>API Changes<a class="headerlink" href="#id8" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>[BREAKING CHANGE] <code class="docutils literal notranslate"><span class="pre">max_output_len</span></code> is removed from the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command; to limit the sequence length at the engine build stage, specify <code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code> instead (see the sketch after this list).</p></li>
<li><p>[BREAKING CHANGE] The <code class="docutils literal notranslate"><span class="pre">use_custom_all_reduce</span></code> argument is removed from <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code>.</p></li>
<li><p>[BREAKING CHANGE] The <code class="docutils literal notranslate"><span class="pre">multi_block_mode</span></code> argument is moved from build stage (<code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> and builder API) to the runtime.</p></li>
<li><p>[BREAKING CHANGE] The build time argument <code class="docutils literal notranslate"><span class="pre">context_fmha_fp32_acc</span></code> is moved to runtime for decoder models.</p></li>
<li><p>[BREAKING CHANGE] The arguments <code class="docutils literal notranslate"><span class="pre">tp_size</span></code>, <code class="docutils literal notranslate"><span class="pre">pp_size</span></code> and <code class="docutils literal notranslate"><span class="pre">cp_size</span></code> are removed from the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>The C++ batch manager API is deprecated in favor of the C++ <code class="docutils literal notranslate"><span class="pre">executor</span></code> API, and it will be removed in a future release of TensorRT-LLM.</p></li>
<li><p>Added a version API to the C++ library; a <code class="docutils literal notranslate"><span class="pre">cpp/include/tensorrt_llm/executor/version.h</span></code> file is now generated.</p></li>
</ul>
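<p>A sketch of the equivalent setting through the Python API, assuming the <code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code> path exposes the same <code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code> knob as the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command; the model path is a placeholder.</p>
<div class="highlight"><pre>
from tensorrt_llm import LLM, BuildConfig

# Limit the total sequence length (input + output) at engine build time.
build_config = BuildConfig(max_seq_len=4096)

llm = LLM(model="meta-llama/Llama-2-7b-hf", build_config=build_config)
print(llm.generate(["Hello"])[0].outputs[0].text)
</pre></div>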
</section>
<section id="id9">
<h3>Model Updates<a class="headerlink" href="#id9" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported LLaMA 3.1 model.</p></li>
<li><p>Supported Mamba-2 model.</p></li>
<li><p>Supported EXAONE model, see <code class="docutils literal notranslate"><span class="pre">examples/exaone/README.md</span></code>.</p></li>
<li><p>Supported Qwen 2 model.</p></li>
<li><p>Supported GLM4 models, see <code class="docutils literal notranslate"><span class="pre">examples/chatglm/README.md</span></code>.</p></li>
<li><p>Added LLaVa-1.6 (LLaVa-NeXT) multimodal support, see “LLaVA, LLaVa-NeXT and VILA” section in <code class="docutils literal notranslate"><span class="pre">examples/multimodal/README.md</span></code>.</p></li>
</ul>
</section>
<section id="id10">
<h3>Fixed Issues<a class="headerlink" href="#id10" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed wrong pad token for the CodeQwen models. (#1953)</p></li>
<li><p>Fixed typo in <code class="docutils literal notranslate"><span class="pre">cluster_infos</span></code> defined in <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/auto_parallel/cluster_info.py</span></code>, thanks to the contribution from &#64;saeyoonoh in #1987.</p></li>
<li><p>Removed duplicated flags in the command at <code class="docutils literal notranslate"><span class="pre">docs/source/reference/troubleshooting.md</span></code>, thanks to the contribution from &#64;hattizai in #1937.</p></li>
<li><p>Fixed segmentation fault in TopP sampling layer, thanks to the contribution from &#64;akhoroshev in #2039. (#2040)</p></li>
<li><p>Fixed the failure when converting the checkpoint for the Mistral Nemo model. (#1985)</p></li>
<li><p>Propagated <code class="docutils literal notranslate"><span class="pre">exclude_modules</span></code> to weight-only quantization, thanks to the contribution from &#64;fjosw in #2056.</p></li>
<li><p>Fixed wrong links in README, thanks to the contribution from &#64;Tayef-Shah in #2028.</p></li>
<li><p>Fixed some typos in the documentation, thanks to the contribution from &#64;lfz941 in #1939.</p></li>
<li><p>Fixed the engine build failure when deduced <code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code> is not an integer. (#2018)</p></li>
</ul>
</section>
<section id="id11">
<h3>Infrastructure Changes<a class="headerlink" href="#id11" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Base Docker image for TensorRT-LLM is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/pytorch:24.07-py3</span></code>.</p></li>
<li><p>Base Docker image for TensorRT-LLM Backend is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.07-py3</span></code>.</p></li>
<li><p>The dependent TensorRT version is updated to 10.3.0.</p></li>
<li><p>The dependent CUDA version is updated to 12.5.1.</p></li>
<li><p>The dependent PyTorch version is updated to 2.4.0.</p></li>
<li><p>The dependent ModelOpt version is updated to v0.15.0.</p></li>
</ul>
</section>
<section id="id12">
<h3>Known Issues<a class="headerlink" href="#id12" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>On Windows, installation of TensorRT-LLM may succeed, but you might hit <code class="docutils literal notranslate"><span class="pre">OSError:</span> <span class="pre">exception:</span> <span class="pre">access</span> <span class="pre">violation</span> <span class="pre">reading</span> <span class="pre">0x0000000000000000</span></code> when importing the library in Python. See <a class="reference external" href="https://nvidia.github.io/TensorRT-LLM/installation/windows.html">Installing on Windows</a> for workarounds.</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-11-0">
<h2>TensorRT-LLM Release 0.11.0<a class="headerlink" href="#tensorrt-llm-release-0-11-0" title="Link to this heading"></a></h2>
<section id="id13">
<h3>Key Features and Enhancements<a class="headerlink" href="#id13" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported very long context for LLaMA (see “Long context evaluation” section in <code class="docutils literal notranslate"><span class="pre">examples/llama/README.md</span></code>).</p></li>
<li><p>Low latency optimization</p>
<ul>
<li><p>Added a reduce-norm feature that fuses the ResidualAdd and LayerNorm kernels after AllReduce into a single kernel; enabling it is recommended when the batch size is small and the generation phase dominates.</p></li>
<li><p>Added FP8 support to the GEMM plugin, which benefits the cases when batch size is smaller than 4.</p></li>
<li><p>Added a fused GEMM-SwiGLU plugin for FP8 on SM90.</p></li>
</ul>
</li>
<li><p>LoRA enhancements</p>
<ul>
<li><p>Supported running FP8 LLaMA with FP16 LoRA checkpoints.</p></li>
<li><p>Added support for quantized base model and FP16/BF16 LoRA.</p>
<ul>
<li><p>SQ OOTB (INT8 A/W) + FP16/BF16/FP32 LoRA</p></li>
<li><p>INT8/INT4 Weight-Only (INT8 W) + FP16/BF16/FP32 LoRA</p></li>
<li><p>Weight-Only Group-wise + FP16/BF16/FP32 LoRA</p></li>
</ul>
</li>
<li><p>Added LoRA support to Qwen2, see “Run models with LoRA” section in <code class="docutils literal notranslate"><span class="pre">examples/qwen/README.md</span></code>.</p></li>
<li><p>Added support for Phi-3-mini/small FP8 base + FP16/BF16 LoRA, see “Run Phi-3 with LoRA” section in <code class="docutils literal notranslate"><span class="pre">examples/phi/README.md</span></code>.</p></li>
<li><p>Added support for starcoder-v2 FP8 base + FP16/BF16 LoRA, see “Run StarCoder2 with LoRA” section in <code class="docutils literal notranslate"><span class="pre">examples/gpt/README.md</span></code>.</p></li>
</ul>
</li>
<li><p>Encoder-decoder models C++ runtime enhancements</p>
<ul>
<li><p>Supported paged KV cache and inflight batching. (#800)</p></li>
<li><p>Supported tensor parallelism.</p></li>
</ul>
</li>
<li><p>Supported INT8 quantization with embedding layer excluded.</p></li>
<li><p>Updated default model for Whisper to <code class="docutils literal notranslate"><span class="pre">distil-whisper/distil-large-v3</span></code>, thanks to the contribution from &#64;IbrahimAmin1 in #1337.</p></li>
<li><p>Supported automatic HuggingFace model download for the Python high-level API.</p></li>
<li><p>Supported explicit draft tokens for in-flight batching.</p></li>
<li><p>Supported local custom calibration datasets, thanks to the contribution from &#64;DreamGenX in #1762.</p></li>
<li><p>Added batched logits post processor.</p></li>
<li><p>Added Hopper qgmma kernel to XQA JIT codepath.</p></li>
<li><p>Supported tensor parallelism and expert parallelism enabled together for MoE.</p></li>
<li><p>Supported pipeline parallelism when the number of layers is not divisible by the pipeline-parallel size.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">numQueuedRequests</span></code> to the iteration stats log of the executor API.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">iterLatencyMilliSec</span></code> to the iteration stats log of the executor API.</p></li>
<li><p>Added a community HuggingFace model zoo, thanks to the contribution from &#64;matichon-vultureprime in #1674.</p></li>
</ul>
</section>
<section id="id14">
<h3>API Changes<a class="headerlink" href="#id14" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>[BREAKING CHANGE] <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command</p>
<ul>
<li><p>Migrated Whisper to the unified workflow (<code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command); see examples/whisper/README.md.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code> in the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command now defaults to 256.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">max_num_tokens</span></code> in the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command now defaults to 8192.</p></li>
<li><p>Deprecated <code class="docutils literal notranslate"><span class="pre">max_output_len</span></code> and added <code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code>.</p></li>
<li><p>Removed unnecessary <code class="docutils literal notranslate"><span class="pre">--weight_only_precision</span></code> argument from <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>Removed <code class="docutils literal notranslate"><span class="pre">attention_qk_half_accumulation</span></code> argument from <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>Removed <code class="docutils literal notranslate"><span class="pre">use_context_fmha_for_generation</span></code> argument from <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>Removed <code class="docutils literal notranslate"><span class="pre">strongly_typed</span></code> argument from <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command.</p></li>
<li><p>The default value of <code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code> is now read from the HuggingFace model config.</p></li>
</ul>
</li>
<li><p>C++ runtime</p>
<ul>
<li><p>[BREAKING CHANGE] Renamed <code class="docutils literal notranslate"><span class="pre">free_gpu_memory_fraction</span></code> in <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> to <code class="docutils literal notranslate"><span class="pre">kv_cache_free_gpu_memory_fraction</span></code>.</p></li>
<li><p>[BREAKING CHANGE] Refactored <code class="docutils literal notranslate"><span class="pre">GptManager</span></code> API</p>
<ul>
<li><p>Moved <code class="docutils literal notranslate"><span class="pre">maxBeamWidth</span></code> into <code class="docutils literal notranslate"><span class="pre">TrtGptModelOptionalParams</span></code>.</p></li>
<li><p>Moved <code class="docutils literal notranslate"><span class="pre">schedulerConfig</span></code> into <code class="docutils literal notranslate"><span class="pre">TrtGptModelOptionalParams</span></code>.</p></li>
</ul>
</li>
<li><p>Added some more options to <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code>, including <code class="docutils literal notranslate"><span class="pre">max_tokens_in_paged_kv_cache</span></code>, <code class="docutils literal notranslate"><span class="pre">kv_cache_enable_block_reuse</span></code> and <code class="docutils literal notranslate"><span class="pre">enable_chunked_context</span></code>.</p></li>
</ul>
</li>
<li><p>[BREAKING CHANGE] Python high-level API</p>
<ul>
<li><p>Removed the <code class="docutils literal notranslate"><span class="pre">ModelConfig</span></code> class, and all the options are moved to <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class.</p></li>
<li><p>Refactored the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class; refer to <code class="docutils literal notranslate"><span class="pre">examples/high-level-api/README.md</span></code></p>
<ul>
<li><p>Moved the most commonly used options into the explicit argument list and hid the expert options in the kwargs.</p></li>
<li><p>Exposed <code class="docutils literal notranslate"><span class="pre">model</span></code> to accept either a HuggingFace model name or a local HuggingFace model, TensorRT-LLM checkpoint, or TensorRT-LLM engine.</p></li>
<li><p>Supported downloading models from the HuggingFace model hub; currently only Llama variants are supported.</p></li>
<li><p>Supported a build cache that reuses built TensorRT-LLM engines by setting the environment variable <code class="docutils literal notranslate"><span class="pre">TLLM_HLAPI_BUILD_CACHE=1</span></code> or passing <code class="docutils literal notranslate"><span class="pre">enable_build_cache=True</span></code> to the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> class (see the sketch after this list).</p></li>
<li><p>Exposed low-level options such as <code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code> and <code class="docutils literal notranslate"><span class="pre">SchedulerConfig</span></code> in the kwargs, so details of the build and runtime phases can be configured.</p></li>
</ul>
</li>
<li><p>Refactored <code class="docutils literal notranslate"><span class="pre">LLM.generate()</span></code> and <code class="docutils literal notranslate"><span class="pre">LLM.generate_async()</span></code> API.</p>
<ul>
<li><p>Removed <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code>.</p></li>
<li><p>Added <code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code> with more extensive parameters, see <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/hlapi/utils.py</span></code>.</p>
<ul>
<li><p>The new <code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code> contains and manages fields from Python bindings of <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code>, <code class="docutils literal notranslate"><span class="pre">OutputConfig</span></code>, and so on.</p></li>
</ul>
</li>
<li><p>Refactored <code class="docutils literal notranslate"><span class="pre">LLM.generate()</span></code> output as <code class="docutils literal notranslate"><span class="pre">RequestOutput</span></code>, see <code class="docutils literal notranslate"><span class="pre">tensorrt_llm/hlapi/llm.py</span></code>.</p></li>
</ul>
</li>
<li><p>Updated the <code class="docutils literal notranslate"><span class="pre">apps</span></code> examples, specifically by rewriting both <code class="docutils literal notranslate"><span class="pre">chat.py</span></code> and <code class="docutils literal notranslate"><span class="pre">fastapi_server.py</span></code> using the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> APIs; refer to <code class="docutils literal notranslate"><span class="pre">examples/apps/README.md</span></code> for details.</p>
<ul>
<li><p>Updated the <code class="docutils literal notranslate"><span class="pre">chat.py</span></code> to support multi-turn conversation, allowing users to chat with a model in the terminal.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">fastapi_server.py</span></code> and eliminated the need for <code class="docutils literal notranslate"><span class="pre">mpirun</span></code> in multi-GPU scenarios.</p></li>
</ul>
</li>
</ul>
</li>
<li><p>[BREAKING CHANGE] Speculative decoding configurations unification</p>
<ul>
<li><p>Introduced <code class="docutils literal notranslate"><span class="pre">SpeculativeDecodingMode.h</span></code> to choose between different speculative decoding techniques.</p></li>
<li><p>Introduced <code class="docutils literal notranslate"><span class="pre">SpeculativeDecodingModule.h</span></code>, a base class for speculative decoding techniques.</p></li>
<li><p>Removed <code class="docutils literal notranslate"><span class="pre">decodingMode.h</span></code>.</p></li>
</ul>
</li>
<li><p><code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code></p>
<ul>
<li><p>[BREAKING CHANGE] The <code class="docutils literal notranslate"><span class="pre">api</span></code> option in the <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> command now defaults to <code class="docutils literal notranslate"><span class="pre">executor</span></code>.</p></li>
<li><p>Added a runtime <code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code>.</p></li>
<li><p>Added a runtime <code class="docutils literal notranslate"><span class="pre">max_num_tokens</span></code>.</p></li>
</ul>
</li>
<li><p>[BREAKING CHANGE] Added a <code class="docutils literal notranslate"><span class="pre">bias</span></code> argument to the <code class="docutils literal notranslate"><span class="pre">LayerNorm</span></code> module to support layer normalization without bias.</p></li>
<li><p>[BREAKING CHANGE] Removed <code class="docutils literal notranslate"><span class="pre">GptSession</span></code> Python bindings.</p></li>
</ul>
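<p>A rough sketch of the refactored high-level API described above, assuming the <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.hlapi</span></code> import path referenced in these notes and the argument names shown; the model name is a placeholder.</p>
<div class="highlight"><pre>
from tensorrt_llm.hlapi import LLM, SamplingParams

# model accepts a Hugging Face name, a local checkpoint, or a prebuilt engine directory;
# enable_build_cache reuses previously built engines (TLLM_HLAPI_BUILD_CACHE=1 also works).
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", enable_build_cache=True)

# SamplingParams replaces the removed SamplingConfig; the field name below is assumed
# for this release (it was renamed to max_tokens in a later release).
params = SamplingParams(max_new_tokens=32)

# generate() now returns RequestOutput objects.
for output in llm.generate(["Hello, my name is"], params):
    print(output.outputs[0].text)
</pre></div>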
</section>
<section id="id15">
<h3>Model Updates<a class="headerlink" href="#id15" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Supported Jais, see <code class="docutils literal notranslate"><span class="pre">examples/jais/README.md</span></code>.</p></li>
<li><p>Supported DiT, see <code class="docutils literal notranslate"><span class="pre">examples/dit/README.md</span></code>.</p></li>
<li><p>Supported VILA 1.5.</p></li>
<li><p>Supported Video NeVA, see <code class="docutils literal notranslate"><span class="pre">Video</span> <span class="pre">NeVA</span></code> section in <code class="docutils literal notranslate"><span class="pre">examples/multimodal/README.md</span></code>.</p></li>
<li><p>Supported Grok-1, see <code class="docutils literal notranslate"><span class="pre">examples/grok/README.md</span></code>.</p></li>
<li><p>Supported Qwen1.5-110B with FP8 PTQ.</p></li>
<li><p>Supported Phi-3 small model with block sparse attention.</p></li>
<li><p>Supported InternLM2 7B/20B, thanks to the contribution from &#64;RunningLeon in #1392.</p></li>
<li><p>Supported Phi-3-medium models, see <code class="docutils literal notranslate"><span class="pre">examples/phi/README.md</span></code>.</p></li>
<li><p>Supported Qwen1.5 MoE A2.7B.</p></li>
<li><p>Supported the Phi-3-vision multimodal model.</p></li>
</ul>
</section>
<section id="id16">
<h3>Fixed Issues<a class="headerlink" href="#id16" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed broken outputs for cases when the batch size is larger than 1. (#1539)</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">top_k</span></code> type in <code class="docutils literal notranslate"><span class="pre">executor.py</span></code>, thanks to the contribution from &#64;vonjackustc in #1329.</p></li>
<li><p>Fixed stop and bad word list pointer offset in Python runtime, thanks to the contribution from &#64;fjosw in #1486.</p></li>
<li><p>Fixed some typos for Whisper model, thanks to the contribution from &#64;Pzzzzz5142 in #1328.</p></li>
<li><p>Fixed export failure with CUDA driver &lt; 526 and pynvml &gt;= 11.5.0, thanks to the contribution from &#64;CoderHam in #1537.</p></li>
<li><p>Fixed an issue in NMT weight conversion, thanks to the contribution from &#64;Pzzzzz5142 in #1660.</p></li>
<li><p>Fixed LLaMA Smooth Quant conversion, thanks to the contribution from &#64;lopuhin in #1650.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">qkv_bias</span></code> shape issue for Qwen1.5-32B (#1589), thanks to the contribution from &#64;Tlntin in #1637.</p></li>
<li><p>Fixed the error of Ada traits for <code class="docutils literal notranslate"><span class="pre">fpA_intB</span></code>, thanks to the contribution from &#64;JamesTheZ in #1583.</p></li>
<li><p>Updated <code class="docutils literal notranslate"><span class="pre">examples/qwenvl/requirements.txt</span></code>, thanks to the contribution from &#64;ngoanpv in #1248.</p></li>
<li><p>Fixed rsLoRA scaling in <code class="docutils literal notranslate"><span class="pre">lora_manager</span></code>, thanks to the contribution from &#64;TheCodeWrangler in #1669.</p></li>
<li><p>Fixed Qwen1.5 checkpoint convert failure #1675.</p></li>
<li><p>Fixed Medusa safetensors and AWQ conversion, thanks to the contribution from &#64;Tushar-ml in #1535.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">convert_hf_mpt_legacy</span></code> call failure when the function is called in other than global scope, thanks to the contribution from &#64;bloodeagle40234 in #1534.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">use_fp8_context_fmha</span></code> broken outputs (#1539).</p></li>
<li><p>Fixed pre-norm weight conversion for NMT models, thanks to the contribution from &#64;Pzzzzz5142 in #1723.</p></li>
<li><p>Fixed random seed initialization issue, thanks to the contribution from &#64;pathorn in #1742.</p></li>
<li><p>Fixed stop words and bad words in python bindings. (#1642)</p></li>
<li><p>Fixed an issue when converting the checkpoint for Mistral 7B v0.3, thanks to the contribution from &#64;Ace-RR in #1732.</p></li>
<li><p>Fixed broken in-flight batching for FP8 Llama and Mixtral, thanks to the contribution from &#64;bprus in #1738.</p></li>
<li><p>Fixed a failure when <code class="docutils literal notranslate"><span class="pre">quantize.py</span></code> exports data to <code class="docutils literal notranslate"><span class="pre">config.json</span></code>, thanks to the contribution from &#64;janpetrov in #1676.</p></li>
<li><p>Raised an error when autopp detects an unsupported quantization plugin. (#1626)</p></li>
<li><p>Fixed an issue where <code class="docutils literal notranslate"><span class="pre">shared_embedding_table</span></code> is not set when loading Gemma (#1799), thanks to the contribution from &#64;mfuntowicz.</p></li>
<li><p>Fixed stop and bad words lists to be contiguous for <code class="docutils literal notranslate"><span class="pre">ModelRunner</span></code> (#1815), thanks to the contribution from &#64;Marks101.</p></li>
<li><p>Fixed missing comment for <code class="docutils literal notranslate"><span class="pre">FAST_BUILD</span></code>, thanks to the support from &#64;lkm2835 in #1851.</p></li>
<li><p>Fixed an issue where Top-P sampling occasionally produces invalid tokens. (#1590)</p></li>
<li><p>Fixed #1424.</p></li>
<li><p>Fixed #1529.</p></li>
<li><p>Fixed <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/README.md</span></code> for #1562 and #1552.</p></li>
<li><p>Fixed dead link, thanks to the help from &#64;DefTruth, &#64;buvnswrn and &#64;sunjiabin17 in: https://github.com/triton-inference-server/tensorrtllm_backend/pull/478, https://github.com/triton-inference-server/tensorrtllm_backend/pull/482 and https://github.com/triton-inference-server/tensorrtllm_backend/pull/449.</p></li>
</ul>
</section>
<section id="id17">
<h3>Infrastructure Changes<a class="headerlink" href="#id17" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Base Docker image for TensorRT-LLM is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/pytorch:24.05-py3</span></code>.</p></li>
<li><p>Base Docker image for TensorRT-LLM backend is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.05-py3</span></code>.</p></li>
<li><p>The dependent TensorRT version is updated to 10.2.0.</p></li>
<li><p>The dependent CUDA version is updated to 12.4.1.</p></li>
<li><p>The dependent PyTorch version is updated to 2.3.1.</p></li>
<li><p>The dependent ModelOpt version is updated to v0.13.0.</p></li>
</ul>
</section>
<section id="id18">
<h3>Known Issues<a class="headerlink" href="#id18" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>In a conda environment on Windows, installation of TensorRT-LLM may succeed. However, when importing the library in Python, you may receive an error message of <code class="docutils literal notranslate"><span class="pre">OSError:</span> <span class="pre">exception:</span> <span class="pre">access</span> <span class="pre">violation</span> <span class="pre">reading</span> <span class="pre">0x0000000000000000</span></code>. This issue is under investigation.</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-10-0">
<h2>TensorRT-LLM Release 0.10.0<a class="headerlink" href="#tensorrt-llm-release-0-10-0" title="Link to this heading"></a></h2>
<section id="announcements">
<h3>Announcements<a class="headerlink" href="#announcements" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>TensorRT-LLM supports TensorRT 10.0.1 and NVIDIA NGC 24.03 containers.</p></li>
</ul>
</section>
<section id="id19">
<h3>Key Features and Enhancements<a class="headerlink" href="#id19" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>The Python high level API</p>
<ul>
<li><p>Added embedding parallel, embedding sharing, and fused MLP support.</p></li>
<li><p>Enabled the usage of the <code class="docutils literal notranslate"><span class="pre">executor</span></code> API.</p></li>
</ul>
</li>
<li><p>Added a weight-stripping feature with a new <code class="docutils literal notranslate"><span class="pre">trtllm-refit</span></code> command. For more information, refer to <code class="docutils literal notranslate"><span class="pre">examples/sample_weight_stripping/README.md</span></code>.</p></li>
<li><p>Added a weight-streaming feature. For more information, refer to <code class="docutils literal notranslate"><span class="pre">docs/source/advanced/weight-streaming.md</span></code>.</p></li>
<li><p>Enhanced the multiple profiles feature; the <code class="docutils literal notranslate"><span class="pre">--multiple_profiles</span></code> argument in the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command now builds more optimization profiles for better performance.</p></li>
<li><p>Added FP8 quantization support for Mixtral.</p></li>
<li><p>Added support for pipeline parallelism for GPT.</p></li>
<li><p>Optimized <code class="docutils literal notranslate"><span class="pre">applyBiasRopeUpdateKVCache</span></code> kernel by avoiding re-computation.</p></li>
<li><p>Reduced overheads between <code class="docutils literal notranslate"><span class="pre">enqueue</span></code> calls of TensorRT engines.</p></li>
<li><p>Added support for paged KV cache for enc-dec models. The support is limited to beam width 1.</p></li>
<li><p>Added W4A(fp)8 CUTLASS kernels for the NVIDIA Ada Lovelace architecture.</p></li>
<li><p>Added debug options (<code class="docutils literal notranslate"><span class="pre">--visualize_network</span></code> and <code class="docutils literal notranslate"><span class="pre">--dry_run</span></code>) to the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command to visualize the TensorRT network before engine build.</p></li>
<li><p>Integrated the new NVIDIA Hopper XQA kernels for LLaMA 2 70B model.</p></li>
<li><p>Improved the performance of pipeline parallelism when enabling in-flight batching.</p></li>
<li><p>Supported quantization for Nemotron models.</p></li>
<li><p>Added LoRA support for Mixtral and Qwen.</p></li>
<li><p>Added in-flight batching support for ChatGLM models.</p></li>
<li><p>Added support to <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> so that it runs with the <code class="docutils literal notranslate"><span class="pre">executor</span></code> API for IFB-compatible models (a usage sketch follows this list).</p></li>
<li><p>Enhanced the custom <code class="docutils literal notranslate"><span class="pre">AllReduce</span></code> by adding a heuristic; it falls back to the native NCCL kernel when hardware requirements are not satisfied, to get the best performance.</p></li>
<li><p>Optimized the performance of checkpoint conversion process for LLaMA.</p></li>
<li><p>Benchmark</p>
<ul>
<li><p>[BREAKING CHANGE] Moved the request rate generation arguments and logic from prepare dataset script to <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code>.</p></li>
<li><p>Enabled streaming and supported <code class="docutils literal notranslate"><span class="pre">Time</span> <span class="pre">To</span> <span class="pre">First</span> <span class="pre">Token</span> <span class="pre">(TTFT)</span></code> latency and <code class="docutils literal notranslate"><span class="pre">Inter-Token</span> <span class="pre">Latency</span> <span class="pre">(ITL)</span></code> metrics for <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code>.</p></li>
<li><p>Added the <code class="docutils literal notranslate"><span class="pre">--max_attention_window</span></code> option to <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code>.</p></li>
</ul>
</li>
</ul>
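<p>As a rough illustration of the <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> path mentioned above, the following is a minimal sketch. It assumes an engine built for in-flight batching, pre-tokenized input IDs, and keyword names modeled on <code class="docutils literal notranslate"><span class="pre">examples/run.py</span></code>; the exact signatures may differ between releases.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only (not part of the release notes): drive an IFB-compatible engine
# through ModelRunnerCpp, which is backed by the executor API in this release.
# The engine path, token IDs, and keyword names are assumptions.
import torch

from tensorrt_llm.runtime import ModelRunnerCpp

runner = ModelRunnerCpp.from_dir(engine_dir="/path/to/engine_dir")
outputs = runner.generate(
    batch_input_ids=[torch.tensor([1, 529, 3087, 292], dtype=torch.int32)],
    max_new_tokens=32,
    end_id=2,
    pad_id=2,
)
print(outputs)  # output token IDs; decode them with your tokenizer
</pre></div></div>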
</section>
<section id="id20">
<h3>API Changes<a class="headerlink" href="#id20" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>[BREAKING CHANGE] Set the default <code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code> argument of the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command to 64 for better performance.</p></li>
<li><p>[BREAKING CHANGE] Migrated enc-dec models to the unified workflow.</p></li>
<li><p>[BREAKING CHANGE] Renamed <code class="docutils literal notranslate"><span class="pre">GptModelConfig</span></code> to <code class="docutils literal notranslate"><span class="pre">ModelConfig</span></code>.</p></li>
<li><p>[BREAKING CHANGE] Added speculative decoding mode to the builder API.</p></li>
<li><p>[BREAKING CHANGE] Refactored scheduling configurations</p>
<ul>
<li><p>Unified the <code class="docutils literal notranslate"><span class="pre">SchedulerPolicy</span></code> with the same name in <code class="docutils literal notranslate"><span class="pre">batch_scheduler</span></code> and <code class="docutils literal notranslate"><span class="pre">executor</span></code>, and renamed it to <code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy</span></code>.</p></li>
<li><p>Expanded the existing configuration scheduling strategy from <code class="docutils literal notranslate"><span class="pre">SchedulerPolicy</span></code> to <code class="docutils literal notranslate"><span class="pre">SchedulerConfig</span></code> to enhance extensibility. The latter also introduces a chunk-based configuration called <code class="docutils literal notranslate"><span class="pre">ContextChunkingPolicy</span></code>.</p></li>
</ul>
</li>
<li><p>[BREAKING CHANGE] The input prompt was removed from the generation output in the <code class="docutils literal notranslate"><span class="pre">generate()</span></code> and <code class="docutils literal notranslate"><span class="pre">generate_async()</span></code> APIs. For example, when given a prompt as <code class="docutils literal notranslate"><span class="pre">A</span> <span class="pre">B</span></code>, the original generation result could be <code class="docutils literal notranslate"><span class="pre">&lt;s&gt;A</span> <span class="pre">B</span> <span class="pre">C</span> <span class="pre">D</span> <span class="pre">E</span></code> where only <code class="docutils literal notranslate"><span class="pre">C</span> <span class="pre">D</span> <span class="pre">E</span></code> is the actual output, and now the result is <code class="docutils literal notranslate"><span class="pre">C</span> <span class="pre">D</span> <span class="pre">E</span></code> (see the sketch after this list).</p></li>
<li><p>[BREAKING CHANGE] Switched default <code class="docutils literal notranslate"><span class="pre">add_special_token</span></code> in the TensorRT-LLM backend to <code class="docutils literal notranslate"><span class="pre">True</span></code>.</p></li>
<li><p>Deprecated <code class="docutils literal notranslate"><span class="pre">GptSession</span></code> and <code class="docutils literal notranslate"><span class="pre">TrtGptModelV1</span></code>.</p></li>
</ul>
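<p>A minimal sketch of the <code class="docutils literal notranslate"><span class="pre">generate()</span></code> behavior change described above, assuming the high-level <code class="docutils literal notranslate"><span class="pre">LLM</span></code> API; the import path and the shape of the returned objects vary between releases and are assumptions here.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only: generate() now returns just the continuation ("C D E"),
# without echoing the input prompt ("A B"). Import path and output handling
# are assumptions, not the exact API of this release.
from tensorrt_llm.hlapi import LLM  # assumed import path

llm = LLM(model="/path/to/model_or_engine")  # hypothetical local path
outputs = llm.generate(["A B"])              # prompt is "A B"

for out in outputs:
    # Previously the result could look like "&lt;s&gt;A B C D E";
    # after this change only the continuation "C D E" is returned.
    print(out)
</pre></div></div>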
</section>
<section id="id21">
<h3>Model Updates<a class="headerlink" href="#id21" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Support DBRX</p></li>
<li><p>Support Qwen2</p></li>
<li><p>Support CogVLM</p></li>
<li><p>Support ByT5</p></li>
<li><p>Support LLaMA 3</p></li>
<li><p>Support Arctic (w/ FP8)</p></li>
<li><p>Support Fuyu</p></li>
<li><p>Support Persimmon</p></li>
<li><p>Support Deplot</p></li>
<li><p>Support Phi-3-Mini with long Rope</p></li>
<li><p>Support Neva</p></li>
<li><p>Support Kosmos-2</p></li>
<li><p>Support RecurrentGemma</p></li>
</ul>
</section>
<section id="id22">
<h3>Fixed Issues<a class="headerlink" href="#id22" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed some unexpected behaviors in beam search and early stopping, so that the outputs are more accurate.</p></li>
<li><p>Fixed segmentation fault with pipeline parallelism and <code class="docutils literal notranslate"><span class="pre">gather_all_token_logits</span></code>. (#1284)</p></li>
<li><p>Removed the unnecessary check in XQA to fix Code Llama 70B Triton crashes. (#1256)</p></li>
<li><p>Fixed an unsupported ScalarType issue for BF16 LoRA. (https://github.com/triton-inference-server/tensorrtllm_backend/issues/403)</p></li>
<li><p>Eliminated the load and save of prompt table in multimodal. (https://github.com/NVIDIA/TensorRT-LLM/discussions/1436)</p></li>
<li><p>Fixed an error when converting the model weights of Qwen 72B INT4-GPTQ. (#1344)</p></li>
<li><p>Fixed early stopping and failures on in-flight batching cases of Medusa. (#1449)</p></li>
<li><p>Added support for more NVLink versions for auto parallelism. (#1467)</p></li>
<li><p>Fixed the assert failure caused by default values of sampling config. (#1447)</p></li>
<li><p>Fixed a requirement specification on Windows for nvidia-cudnn-cu12. (#1446)</p></li>
<li><p>Fixed MMHA relative position calculation error in <code class="docutils literal notranslate"><span class="pre">gpt_attention_plugin</span></code> for enc-dec models. (#1343)</p></li>
</ul>
</section>
<section id="id23">
<h3>Infrastructure changes<a class="headerlink" href="#id23" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Base Docker image for TensorRT-LLM is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/pytorch:24.03-py3</span></code>.</p></li>
<li><p>Base Docker image for TensorRT-LLM backend is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.03-py3</span></code>.</p></li>
<li><p>The dependent TensorRT version is updated to 10.0.1.</p></li>
<li><p>The dependent CUDA version is updated to 12.4.0.</p></li>
<li><p>The dependent PyTorch version is updated to 2.2.2.</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-9-0">
<h2>TensorRT-LLM Release 0.9.0<a class="headerlink" href="#tensorrt-llm-release-0-9-0" title="Link to this heading"></a></h2>
<section id="id24">
<h3>Announcements<a class="headerlink" href="#id24" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>TensorRT-LLM requires TensorRT 9.3 and NVIDIA NGC 24.02 containers.</p></li>
</ul>
</section>
<section id="id25">
<h3>Key Features and Enhancements<a class="headerlink" href="#id25" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p><strong>[BREAKING CHANGES]</strong> TopP sampling optimization with deterministic AIR TopP algorithm is enabled by default</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Added support for embedding sharing for Gemma</p></li>
<li><p>Added support for context chunking to work with KV cache reuse</p></li>
<li><p>Enabled different rewind tokens per sequence for Medusa</p></li>
<li><p>Added BART LoRA support (limited to the Python runtime)</p></li>
<li><p>Enabled multi-LoRA for BART LoRA</p></li>
<li><p>Added support for <code class="docutils literal notranslate"><span class="pre">early_stopping=False</span></code> in beam search for C++ Runtime</p></li>
<li><p>Added support for logits post processor to the batch manager</p></li>
<li><p>Added support for importing and converting HuggingFace Gemma checkpoints</p></li>
<li><p>Added support for loading Gemma from HuggingFace</p></li>
<li><p>Added support for auto parallelism planner for high-level API and unified builder workflow</p></li>
<li><p>Added support for running <code class="docutils literal notranslate"><span class="pre">GptSession</span></code> without OpenMPI</p></li>
<li><p>Added support for Medusa IFB</p></li>
<li><p><strong>[Experimental]</strong> Added support for FP8 FMHA, note that the performance is not optimal, and we will keep optimizing it</p></li>
<li><p>Added support for more head sizes for LLaMA-like models</p>
<ul>
<li><p>NVIDIA Ampere (SM80, SM86), NVIDIA Ada Lovelace (SM89), NVIDIA Hopper (SM90) all support head sizes [32, 40, 64, 80, 96, 104, 128, 160, 256]</p></li>
</ul>
</li>
<li><p>Added support for OOTB functionality</p>
<ul>
<li><p>T5</p></li>
<li><p>Mixtral 8x7B</p></li>
</ul>
</li>
<li><p>Benchmark features</p>
<ul>
<li><p>Added emulated static batching in <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code></p></li>
<li><p>Added support for arbitrary dataset from HuggingFace for C++ benchmarks</p></li>
<li><p>Added percentile latency report to <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code></p></li>
</ul>
</li>
<li><p>Performance features</p>
<ul>
<li><p>Optimized <code class="docutils literal notranslate"><span class="pre">gptDecoderBatch</span></code> to support batched sampling</p></li>
<li><p>Enabled FMHA for models in BART, Whisper, and NMT family</p></li>
<li><p>Removed router tensor parallelism to improve performance for MoE models</p></li>
<li><p>Improved custom all-reduce kernel</p></li>
</ul>
</li>
<li><p>Infrastructure features</p>
<ul>
<li><p>Base Docker image for TensorRT-LLM is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/pytorch:24.02-py3</span></code></p></li>
<li><p>The dependent PyTorch version is updated to 2.2</p></li>
<li><p>Base Docker image for TensorRT-LLM backend is updated to <code class="docutils literal notranslate"><span class="pre">nvcr.io/nvidia/tritonserver:24.02-py3</span></code></p></li>
<li><p>The dependent CUDA version is updated to 12.3.2 (12.3 Update 2)</p></li>
</ul>
</li>
</ul>
</section>
<section id="id26">
<h3>API Changes<a class="headerlink" href="#id26" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Added C++ <code class="docutils literal notranslate"><span class="pre">executor</span></code> API</p></li>
<li><p>Added Python bindings</p></li>
<li><p>Added advanced and multi-GPU examples for Python binding of <code class="docutils literal notranslate"><span class="pre">executor</span></code> C++ API</p></li>
<li><p>Added documents for C++ <code class="docutils literal notranslate"><span class="pre">executor</span></code> API</p></li>
<li><p>Migrated Mixtral to high-level API and unified builder workflow</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Moved LLaMA convert checkpoint script from examples directory into the core library</p></li>
<li><p>Added support for <code class="docutils literal notranslate"><span class="pre">LLM()</span></code> API to accept engines built by <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Removed the <code class="docutils literal notranslate"><span class="pre">model</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> and <code class="docutils literal notranslate"><span class="pre">gptSessionBenchmark</span></code></p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Refactored GPT with unified building workflow</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Refactored the Qwen model to the unified build workflow</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Moved all the LoRA-related flags from the <code class="docutils literal notranslate"><span class="pre">convert_checkpoint.py</span></code> script and the checkpoint content to the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command to generalize the feature better to more models</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Removed the <code class="docutils literal notranslate"><span class="pre">use_prompt_tuning</span></code> flag and option from the <code class="docutils literal notranslate"><span class="pre">convert_checkpoint.py</span></code> script and the checkpoint content to generalize the feature better to more models. Use <code class="docutils literal notranslate"><span class="pre">trtllm-build</span> <span class="pre">--max_prompt_embedding_table_size</span></code> instead.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Changed the <code class="docutils literal notranslate"><span class="pre">trtllm-build</span> <span class="pre">--world_size</span></code> flag to the <code class="docutils literal notranslate"><span class="pre">--auto_parallel</span></code> flag. The option is used for auto parallel planner only.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> <code class="docutils literal notranslate"><span class="pre">AsyncLLMEngine</span></code> is removed. The <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.GenerationExecutor</span></code> class is refactored to work both when launched explicitly with <code class="docutils literal notranslate"><span class="pre">mpirun</span></code> at the application level and when accepting an MPI communicator created by <code class="docutils literal notranslate"><span class="pre">mpi4py</span></code>.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> <code class="docutils literal notranslate"><span class="pre">examples/server</span></code> are removed.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Removed LoRA related parameters from the convert checkpoint scripts.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Simplified Qwen convert checkpoint script.</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Reused the <code class="docutils literal notranslate"><span class="pre">QuantConfig</span></code> used in <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> tool to support broader quantization features.</p></li>
<li><p>Added support for TensorRT-LLM checkpoint as model input.</p></li>
<li><p>Refined <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code> used in <code class="docutils literal notranslate"><span class="pre">LLM.generate</span></code> or <code class="docutils literal notranslate"><span class="pre">LLM.generate_async</span></code> APIs, with the support of beam search, a variety of penalties, and more features.</p></li>
<li><p>Added support for the <code class="docutils literal notranslate"><span class="pre">StreamingLLM</span></code> feature. Enable it by setting <code class="docutils literal notranslate"><span class="pre">LLM(streaming_llm=...)</span></code> (see the sketch after this list).</p></li>
</ul>
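<p>A minimal sketch of the refined <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code> and the <code class="docutils literal notranslate"><span class="pre">StreamingLLM</span></code> toggle described above. Apart from the <code class="docutils literal notranslate"><span class="pre">LLM(streaming_llm=...)</span></code> keyword named in this list, the import path, field names, and values below are assumptions for illustration; refer to the high-level API examples of this release for the exact usage.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only: field names and import paths are assumptions.
from tensorrt_llm.hlapi import LLM, SamplingConfig  # assumed import path

llm = LLM(
    model="/path/to/llama_checkpoint_or_engine",
    streaming_llm=True,        # hypothetical value for LLM(streaming_llm=...)
)

sampling = SamplingConfig(
    max_new_tokens=64,         # assumed field name
    beam_width=2,              # beam search, as supported by the refined config
    repetition_penalty=1.1,    # one of the supported penalties
)

for output in llm.generate(["Hello, my name is"], sampling_config=sampling):
    print(output)
</pre></div></div>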
</section>
<section id="id27">
<h3>Model Updates<a class="headerlink" href="#id27" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Added support for distil-whisper</p></li>
<li><p>Added support for HuggingFace StarCoder2</p></li>
<li><p>Added support for VILA</p></li>
<li><p>Added support for Smaug-72B-v0.1</p></li>
<li><p>Migrated BLIP-2 examples to <code class="docutils literal notranslate"><span class="pre">examples/multimodal</span></code></p></li>
</ul>
</section>
<section id="limitations">
<h3>Limitations<a class="headerlink" href="#limitations" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">openai-triton</span></code> examples are not supported on Windows.</p></li>
</ul>
</section>
<section id="id28">
<h3>Fixed Issues<a class="headerlink" href="#id28" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed a weight-only quant bug for Whisper to make sure that the <code class="docutils literal notranslate"><span class="pre">encoder_input_len_range</span></code> is not <code class="docutils literal notranslate"><span class="pre">0</span></code>. (#992)</p></li>
<li><p>Fixed an issue that log probabilities in Python runtime are not returned. (#983)</p></li>
<li><p>Multi-GPU fixes for multimodal examples. (#1003)</p></li>
<li><p>Fixed a wrong <code class="docutils literal notranslate"><span class="pre">end_id</span></code> issue for Qwen. (#987)</p></li>
<li><p>Fixed a non-stopping generation issue. (#1118, #1123)</p></li>
<li><p>Fixed a wrong link in <code class="docutils literal notranslate"><span class="pre">examples/mixtral/README.md</span></code>. (#1181)</p></li>
<li><p>Fixed LLaMA2-7B bad results when INT8 kv cache and per-channel INT8 weight only are enabled. (#967)</p></li>
<li><p>Fixed a wrong <code class="docutils literal notranslate"><span class="pre">head_size</span></code> when importing a Gemma model from HuggingFace Hub. (#1148)</p></li>
<li><p>Fixed ChatGLM2-6B building failure on INT8. (#1239)</p></li>
<li><p>Fixed a wrong relative path in Baichuan documentation. (#1242)</p></li>
<li><p>Fixed a wrong <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code> tensor in <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code>. (#1183)</p></li>
<li><p>Fixed an error when converting SmoothQuant LLaMA. (#1267)</p></li>
<li><p>Fixed an issue that <code class="docutils literal notranslate"><span class="pre">examples/run.py</span></code> only loads one line from <code class="docutils literal notranslate"><span class="pre">--input_file</span></code>.</p></li>
<li><p>Fixed an issue that <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> does not transfer <code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code> tensor fields correctly. (#1183)</p></li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-8-0">
<h2>TensorRT-LLM Release 0.8.0<a class="headerlink" href="#tensorrt-llm-release-0-8-0" title="Link to this heading"></a></h2>
<section id="id29">
<h3>Key Features and Enhancements<a class="headerlink" href="#id29" title="Link to this heading"></a></h3>
<ul>
<li><p>Chunked context support (see docs/source/advanced/gpt-attention.md#chunked-context)</p></li>
<li><p>LoRA support for C++ runtime (see docs/source/lora.md)</p></li>
<li><p>Medusa decoding support (see examples/medusa/README.md)</p>
<ul class="simple">
<li><p>The support is limited to Python runtime for Ampere or newer GPUs with fp16 and bf16 accuracy, and the <code class="docutils literal notranslate"><span class="pre">temperature</span></code> parameter of sampling configuration should be 0</p></li>
</ul>
</li>
<li><p>StreamingLLM support for LLaMA (see docs/source/advanced/gpt-attention.md#streamingllm)</p></li>
<li><p>Support for batch manager to return logits from context and/or generation phases</p>
<ul class="simple">
<li><p>Include support in the Triton backend</p></li>
</ul>
</li>
<li><p>Support AWQ and GPTQ for QWEN</p></li>
<li><p>Support ReduceScatter plugin</p></li>
<li><p>Support for combining <code class="docutils literal notranslate"><span class="pre">repetition_penalty</span></code> and <code class="docutils literal notranslate"><span class="pre">presence_penalty</span></code> #274</p></li>
<li><p>Support for <code class="docutils literal notranslate"><span class="pre">frequency_penalty</span></code> #275 (see the sampling sketch after this list)</p></li>
<li><p>OOTB functionality support:</p>
<ul class="simple">
<li><p>Baichuan</p></li>
<li><p>InternLM</p></li>
<li><p>Qwen</p></li>
<li><p>BART</p></li>
</ul>
</li>
<li><p>LLaMA</p>
<ul class="simple">
<li><p>Support enabling INT4-AWQ along with FP8 KV Cache</p></li>
<li><p>Support BF16 for weight-only plugin</p></li>
</ul>
</li>
<li><p>Baichuan</p>
<ul class="simple">
<li><p>P-tuning support</p></li>
<li><p>INT4-AWQ and INT4-GPTQ support</p></li>
</ul>
</li>
<li><p>Decoder iteration-level profiling improvements</p></li>
<li><p>Add <code class="docutils literal notranslate"><span class="pre">masked_select</span></code> and <code class="docutils literal notranslate"><span class="pre">cumsum</span></code> functions for modeling</p></li>
<li><p>Smooth Quantization support for ChatGLM2-6B / ChatGLM3-6B / ChatGLM2-6B-32K</p></li>
<li><p>Add Weight-Only Support To Whisper #794, thanks to the contribution from &#64;Eddie-Wang1120</p></li>
<li><p>Support FP16 fMHA on NVIDIA V100 GPU</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Some features are not enabled for all models listed in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples">examples</a> folder.</p>
</div>
</li>
</ul>
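<p>A minimal sketch of combining the penalties listed above through the Python runtime; the keyword names follow the style of <code class="docutils literal notranslate"><span class="pre">examples/run.py</span></code> but should be treated as assumptions, and the paths and token IDs are placeholders.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only: repetition_penalty, presence_penalty (#274), and
# frequency_penalty (#275) combined in a single generate() call.
import torch

from tensorrt_llm.runtime import ModelRunner

runner = ModelRunner.from_dir(engine_dir="/path/to/engine_dir")
outputs = runner.generate(
    batch_input_ids=[torch.tensor([1, 15043, 29892], dtype=torch.int32)],
    max_new_tokens=64,
    repetition_penalty=1.15,
    presence_penalty=0.2,
    frequency_penalty=0.2,
)
</pre></div></div>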
</section>
<section id="id30">
<h3>Model Updates<a class="headerlink" href="#id30" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Phi-1.5/2.0</p></li>
<li><p>Mamba support (see examples/mamba/README.md)</p>
<ul>
<li><p>The support is limited to beam width = 1 and single-node single-GPU</p></li>
</ul>
</li>
<li><p>Nougat support (see examples/multimodal/README.md#nougat)</p></li>
<li><p>Qwen-VL support (see examples/qwenvl/README.md)</p></li>
<li><p>RoBERTa support, thanks to the contribution from &#64;erenup</p></li>
<li><p>Skywork model support</p></li>
<li><p>Add example for multimodal models (BLIP with OPT or T5, LLaVA)</p></li>
</ul>
<p>Refer to the <a class="reference internal" href="reference/support-matrix.html#support-matrix-software"><span class="std std-ref">Software</span></a> section for a list of supported models.</p>
<ul class="simple">
<li><p>API</p>
<ul>
<li><p>Add a set of High-level APIs for end-to-end generation tasks (see examples/high-level-api/README.md)</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Migrate models to the new build workflow, including LLaMA, Mistral, Mixtral, InternLM, ChatGLM, Falcon, GPT-J, GPT-NeoX, Medusa, MPT, Baichuan and Phi (see docs/source/new_workflow.md)</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Deprecate <code class="docutils literal notranslate"><span class="pre">LayerNorm</span></code> and <code class="docutils literal notranslate"><span class="pre">RMSNorm</span></code> plugins and removed corresponding build parameters</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Remove optional parameter <code class="docutils literal notranslate"><span class="pre">maxNumSequences</span></code> for GPT manager</p></li>
</ul>
</li>
<li><p>Fixed Issues</p>
<ul>
<li><p>Fix an issue where the first token is abnormal when <code class="docutils literal notranslate"><span class="pre">--gather_all_token_logits</span></code> is enabled #639</p></li>
<li><p>Fix LLaMA with LoRA enabled build failure #673</p></li>
<li><p>Fix InternLM SmoothQuant build failure #705</p></li>
<li><p>Fix Bloom int8_kv_cache functionality #741</p></li>
<li><p>Fix crash in <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> #649</p></li>
<li><p>Fix Blip2 build error #695</p></li>
<li><p>Add pickle support for <code class="docutils literal notranslate"><span class="pre">InferenceRequest</span></code> #701</p></li>
<li><p>Fix Mixtral-8x7b build failure with custom_all_reduce #825</p></li>
<li><p>Fix INT8 GEMM shape #935</p></li>
<li><p>Minor bug fixes</p></li>
</ul>
</li>
<li><p>Performance</p>
<ul>
<li><p><strong>[BREAKING CHANGES]</strong> Increase default <code class="docutils literal notranslate"><span class="pre">freeGpuMemoryFraction</span></code> parameter from 0.85 to 0.9 for higher throughput</p></li>
<li><p><strong>[BREAKING CHANGES]</strong> Disable <code class="docutils literal notranslate"><span class="pre">enable_trt_overlap</span></code> argument for GPT manager by default</p></li>
<li><p>Performance optimization of beam search kernel</p></li>
<li><p>Add bfloat16 and paged kv cache support for optimized generation MQA/GQA kernels</p></li>
<li><p>Custom AllReduce plugins performance optimization</p></li>
<li><p>Top-P sampling performance optimization</p></li>
<li><p>LoRA performance optimization</p></li>
<li><p>Custom allreduce performance optimization by introducing a ping-pong buffer to avoid an extra synchronization cost</p></li>
<li><p>Integrate XQA kernels for GPT-J (beamWidth=4)</p></li>
</ul>
</li>
<li><p>Documentation</p>
<ul>
<li><p>Batch manager arguments documentation updates</p></li>
<li><p>Add documentation for best practices for tuning the performance of TensorRT-LLM (See docs/source/perf_best_practices.md)</p></li>
<li><p>Add documentation for Falcon AWQ support (See examples/falcon/README.md)</p></li>
<li><p>Update to the <code class="docutils literal notranslate"><span class="pre">docs/source/new_workflow.md</span></code> documentation</p></li>
<li><p>Update AWQ INT4 weight only quantization documentation for GPT-J</p></li>
<li><p>Add blog: Speed up inference with SOTA quantization techniques in TRT-LLM</p></li>
<li><p>Refine TensorRT-LLM backend README structure #133</p></li>
<li><p>Typo fix #739</p></li>
</ul>
</li>
</ul>
</section>
</section>
<section id="tensorrt-llm-release-0-7-1">
<h2>TensorRT-LLM Release 0.7.1<a class="headerlink" href="#tensorrt-llm-release-0-7-1" title="Link to this heading"></a></h2>
<section id="id31">
<h3>Key Features and Enhancements<a class="headerlink" href="#id31" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Speculative decoding (preview)</p></li>
<li><p>Added a Python binding for <code class="docutils literal notranslate"><span class="pre">GptManager</span></code></p></li>
<li><p>Added a Python class <code class="docutils literal notranslate"><span class="pre">ModelRunnerCpp</span></code> that wraps C++ <code class="docutils literal notranslate"><span class="pre">gptSession</span></code></p></li>
<li><p>System prompt caching</p></li>
<li><p>Enabled split-k for weight-only cutlass kernels</p></li>
<li><p>FP8 KV cache support for XQA kernel</p></li>
<li><p>New Python builder API and <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> command (already applied to <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/blip2">blip2</a> and <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/opt#3-build-tensorrt-engines">OPT</a>)</p></li>
<li><p>Support <code class="docutils literal notranslate"><span class="pre">StoppingCriteria</span></code> and <code class="docutils literal notranslate"><span class="pre">LogitsProcessor</span></code> in Python generate API (see the sketch after this list)</p></li>
<li><p>FMHA support for chunked attention and paged KV cache</p></li>
<li><p>Performance enhancements include:</p>
<ul>
<li><p>MMHA optimization for MQA and GQA</p></li>
<li><p>LoRA optimization: cutlass grouped GEMM</p></li>
<li><p>Optimize Hopper warp specialized kernels</p></li>
<li><p>Optimize <code class="docutils literal notranslate"><span class="pre">AllReduce</span></code> for parallel attention on Falcon and GPT-J</p></li>
<li><p>Enable split-k for weight-only cutlass kernel when SM&gt;=75</p></li>
</ul>
</li>
<li><p>Added <span class="xref std std-ref">workflow</span> documentation</p></li>
</ul>
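<p>A minimal sketch of the <code class="docutils literal notranslate"><span class="pre">StoppingCriteria</span></code> and <code class="docutils literal notranslate"><span class="pre">LogitsProcessor</span></code> hooks mentioned above. The import path and call signatures are assumptions modeled on a HuggingFace-style interface, not a verbatim copy of this release's API.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only: a logits processor that bans one token ID and a stopping
# criterion that halts after a fixed number of decoding steps.
# Import path and __call__ signatures are assumptions for illustration.
from tensorrt_llm.runtime.generation import LogitsProcessor, StoppingCriteria


class BanTokenProcessor(LogitsProcessor):
    def __init__(self, banned_id: int):
        self.banned_id = banned_id

    def __call__(self, step, input_ids, scores):
        # Push the banned token's logit to -inf so it is never sampled.
        scores[..., self.banned_id] = float("-inf")
        return scores


class MaxNewTokensCriteria(StoppingCriteria):
    def __init__(self, limit: int):
        self.limit = limit

    def __call__(self, step, input_ids, scores):
        # Stop once `limit` decoding steps have been taken.
        return step &gt;= self.limit
</pre></div></div>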
</section>
<section id="id32">
<h3>Model Updates<a class="headerlink" href="#id32" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>BART and mBART support in encoder-decoder models</p></li>
<li><p>FairSeq Neural Machine Translation (NMT) family</p></li>
<li><p>Mixtral-8x7B model</p></li>
<li><p>Support weight loading for HuggingFace Mixtral model</p></li>
<li><p>OpenAI Whisper</p></li>
<li><p>Mixture of Experts support</p></li>
<li><p>MPT - Int4 AWQ / SmoothQuant support</p></li>
<li><p>Baichuan FP8 quantization support</p></li>
</ul>
</section>
<section id="id33">
<h3>Fixed Issues<a class="headerlink" href="#id33" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>Fixed tokenizer usage in <code class="docutils literal notranslate"><span class="pre">quantize.py</span></code> <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/issues/288">#288</a></p></li>
<li><p>Fixed LLaMa with LoRA error</p></li>
<li><p>Fixed LLaMA GPTQ failure</p></li>
<li><p>Fixed Python binding for InferenceRequest issue</p></li>
<li><p>Fixed CodeLlama SQ accuracy issue</p></li>
</ul>
</section>
<section id="id34">
<h3>Known Issues<a class="headerlink" href="#id34" title="Link to this heading"></a></h3>
<ul class="simple">
<li><p>The hang reported in issue <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend/issues/149">#149</a> has not been reproduced by the TensorRT-LLM team. If it is caused by a bug in TensorRT-LLM, that bug may be present in that release.</p></li>
</ul>
</section>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="key-features.html" class="btn btn-neutral float-left" title="Key Features" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="installation/linux.html" class="btn btn-neutral float-right" title="Installing on Linux" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<div class="footer">
<p>
Copyright © 2024 NVIDIA Corporation
</p>
<p>
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
data-cms-ai="0">Privacy Policy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
data-cms-ai="0">Manage My Privacy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
data-cms-ai="0">Do Not Sell or Share My Data</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
rel="noopener" data-cms-ai="0">Terms of Service</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
data-cms-ai="0">Accessibility</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
data-cms-ai="0">Product Security</a> |
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
data-cms-ai="0">Contact</a>
</p>
</div>
</div>
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>