<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Speed up inference with SOTA quantization techniques in TRT-LLM — tensorrt_llm documentation</title>
</head>
<body>
<section id="speed-up-inference-with-sota-quantization-techniques-in-trt-llm">
<h1>Speed up inference with SOTA quantization techniques in TRT-LLM</h1>
<p>The deployment and inference speed of LLMs are often impeded by limitations in memory capacity, memory bandwidth, and computation power. Quantization emerges as a vital strategy to address these bottlenecks: it represents weights and activations with lower-precision data types such as <a class="reference external" href="https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s52166/">FP8</a>.</p>
<p>In this blog, we provide an overview of the quantization features in TensorRT-LLM, share benchmarks, and offer best practices for selecting the quantization method best suited to your specific use case.</p>
<section id="quantization-in-tensorrt-llm">
<h2>Quantization in TensorRT-LLM</h2>
<p>TensorRT-LLM offers a best-in-class unified quantization toolkit that significantly speeds up DL/GenAI deployment on NVIDIA hardware while maintaining model accuracy. The toolkit is designed with ease of use in mind. You can follow <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/quantization">this user guide</a> to quantize <a class="reference internal" href="../reference/support-matrix.html#models">supported LLMs</a> with a few lines of code. We currently focus on providing SOTA <strong>Post-Training Quantization (PTQ)</strong> and will expand to more model optimization techniques in the near future.</p>
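<p>As a minimal sketch of that workflow (the script name and flags follow the linked quantization example; exact options may differ across releases, and the model paths are purely illustrative), quantizing a LLaMA-style checkpoint to FP8 and building an engine looks roughly like this:</p>
<pre>
# Calibrate the Hugging Face checkpoint and export an FP8 TensorRT-LLM checkpoint.
python examples/quantization/quantize.py \
    --model_dir ./llama-2-7b-hf \
    --dtype float16 \
    --qformat fp8 \
    --output_dir ./llama-2-7b-fp8-ckpt

# Build a TensorRT engine from the quantized checkpoint.
trtllm-build \
    --checkpoint_dir ./llama-2-7b-fp8-ckpt \
    --output_dir ./llama-2-7b-fp8-engine
</pre>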
</section>
<section id="benchmark">
<h2>Benchmark</h2>
<section id="performance">
<h3>Performance</h3>
<p>In the following benchmark, we highlight the acceleration of a few popular models at a small batch size without imposing latency constraints. It’s important to note that in scenarios where your application has a latency constraint, TRT-LLM can achieve an even greater performance improvement. Using LLaMA-v2-7B as an example, when the first-token latency is constrained to be under 500 ms, quantization with FP8 at a batch size of 16 achieves a notable <strong>2.3x inference speedup</strong> over FP16 on an H100.</p>
<table>
<thead>
<tr><th>Model</th><th>Batch Size</th><th>Speedup (FP8 vs. FP16)</th><th>Speedup (INT8 SQ vs. FP16)</th></tr>
</thead>
<tbody>
<tr><td>GPT-J</td><td>1</td><td>1.40x</td><td>1.40x</td></tr>
<tr><td>GPT-J</td><td>8</td><td>1.44x</td><td>1.30x</td></tr>
<tr><td>LLaMA-v2-7B</td><td>1</td><td>1.51x</td><td>1.47x</td></tr>
<tr><td>LLaMA-v2-7B</td><td>8</td><td>1.40x</td><td>1.32x</td></tr>
</tbody>
</table>
<p>* The above benchmarks were run with input length = 1024, output length = 128, and TP = 1 on an H100 80GB GPU.</p>
</section>
<section id="accuracy">
<h3>Accuracy</h3>
<table>
<thead>
<tr><th>Model</th><th>Quantization Methods</th><th>MMLU Baseline (FP16)</th><th>MMLU Post-quantization</th><th>MMLU Loss</th></tr>
</thead>
<tbody>
<tr><td>Falcon-180B</td><td>FP8</td><td>70.4</td><td>70.3</td><td>0.14%</td></tr>
<tr><td></td><td>INT8-SQ</td><td>70.4</td><td>68.6</td><td>2.56%</td></tr>
<tr><td></td><td>INT4-AWQ</td><td>70.4</td><td>69.8</td><td>0.85%</td></tr>
<tr><td>Falcon-40B</td><td>FP8</td><td>56.1</td><td>55.6</td><td>0.89%</td></tr>
<tr><td></td><td>INT8-SQ</td><td>56.1</td><td>54.7</td><td>2.50%</td></tr>
<tr><td></td><td>INT4-AWQ</td><td>56.1</td><td>55.5</td><td>1.07%</td></tr>
<tr><td>LLaMA-v2-70B</td><td>FP8</td><td>69.1</td><td>68.5</td><td>0.87%</td></tr>
<tr><td></td><td>INT8-SQ</td><td>69.1</td><td>67.2</td><td>2.75%</td></tr>
<tr><td></td><td>INT4-AWQ</td><td>69.1</td><td>68.4</td><td>1.01%</td></tr>
<tr><td>MPT-30B</td><td>FP8</td><td>47.5</td><td>47.4</td><td>0.21%</td></tr>
<tr><td></td><td>INT8-SQ</td><td>47.5</td><td>46.8</td><td>1.47%</td></tr>
<tr><td></td><td>INT4-AWQ</td><td>47.5</td><td>46.5</td><td>2.11%</td></tr>
</tbody>
</table>
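<p>Here, MMLU loss is the relative drop from the FP16 baseline, i.e. (baseline − post-quantization) / baseline. For example, for Falcon-180B with FP8: (70.4 − 70.3) / 70.4 ≈ 0.14%.</p>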
</section>
</section>
<section id="best-practices-to-choose-the-right-quantization-methods">
<h2>Best practices to choose the right quantization methods</h2>
<p>A quantization method comprises three primary components:</p>
<ol>
<li>Weight precision format</li>
<li>Activation precision format</li>
<li>Calibration algorithm</li>
</ol>
<p>Typically, in small-batch inference scenarios (batch size ≤ 4), the key consideration is memory bandwidth, making weight-only quantization methods the preferred choice. Conversely, in large-batch inference scenarios such as serving (batch size ≥ 16), both memory bandwidth and computation density become crucial factors, so it’s recommended to opt for a method that quantizes both weights and activations. For batch size ≥ 16, the choice of quantization method can be model specific. We suggest prioritizing FP8, as it typically offers the best performance and accuracy. If the results do not meet your requirements, you can further experiment with Int8 SmoothQuant (Int8 SQ), followed by AWQ and/or GPTQ.</p>
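<p>In terms of the quantization example above, trying these methods in that order mostly amounts to changing the <code>--qformat</code> flag. The values below follow the example’s naming and are an illustrative sketch only; check the example’s README for the options supported by your release:</p>
<pre>
# Preferred starting point: FP8 weights and activations (W8A8).
python examples/quantization/quantize.py --model_dir ./model-hf --qformat fp8      --output_dir ./ckpt-fp8

# Fallback 1: INT8 SmoothQuant (W8A8).
python examples/quantization/quantize.py --model_dir ./model-hf --qformat int8_sq  --output_dir ./ckpt-int8-sq

# Fallback 2: INT4 AWQ (W4A16, weight-only).
python examples/quantization/quantize.py --model_dir ./model-hf --qformat int4_awq --output_dir ./ckpt-int4-awq
</pre>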
<p>Depending on the use case, users may have different tolerances for accuracy impact and calibration time. The table below summarizes the tradeoffs* to consider when choosing a quantization method. You can also learn more about precision formats in our <a class="reference external" href="https://nvidia.github.io/TensorRT-LLM/precision.html">documentation</a>.</p>
<table>
<thead>
<tr><th>Quantization Methods</th><th>Performance Improvement (batch size ≤ 4)</th><th>Performance Improvement (batch size ≥ 16)</th><th>Accuracy Impact</th><th>Calibration Time**</th></tr>
</thead>
<tbody>
<tr><td>FP8 (W8A8)</td><td>Medium</td><td>Medium</td><td>Very Low</td><td>Minutes</td></tr>
<tr><td>Int8 SQ (W8A8)</td><td>Medium</td><td>Medium</td><td>Medium</td><td>Minutes</td></tr>
<tr><td>Int8 weight-only (W8A16)</td><td>Medium</td><td>Low</td><td>Low</td><td>Not Required</td></tr>
<tr><td>Int4 weight-only (W4A16)</td><td>High</td><td>Low</td><td>High</td><td>Not Required</td></tr>
<tr><td>Int4 AWQ (W4A16)</td><td>High</td><td>Low</td><td>Low</td><td>Tens of Minutes</td></tr>
<tr><td>Int4 GPTQ</td><td>High</td><td>Low</td><td>Low</td><td>Tens of Minutes</td></tr>
<tr><td>Int4-FP8 AWQ (W4A8)</td><td>High</td><td>Medium</td><td>Low</td><td>Tens of Minutes</td></tr>
</tbody>
</table>
<p>* The performance improvement and accuracy impact are measured on 10+ popular LLMs. We’ll follow up with more data points.<br/>
** Calibration time is subject to the actual model size.</p>
<p>We note that TensorRT-LLM also offers INT8 and FP8 quantization for the KV cache. The KV cache differs from normal activations in that it occupies non-negligible persistent memory in scenarios such as large batch sizes or long context lengths. If you’re using the KV cache on Hopper and Ada GPUs, we recommend FP8 KV cache over INT8, because the former has a lower accuracy impact in most of the cases we tested. Switching from an FP16 KV cache to an FP8 KV cache also lets you run a 2-3x larger batch size on an H100 for models like GPT-J, which brings a further performance benefit of about 1.5x.</p>
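<p>As a sketch (again assuming the example’s <code>quantize.py</code>; the <code>--kv_cache_dtype</code> flag follows that example and may vary by release), the FP8 KV cache is enabled at calibration time alongside FP8 weights and activations:</p>
<pre>
# Export an FP8 checkpoint that also uses an FP8 KV cache.
python examples/quantization/quantize.py \
    --model_dir ./gpt-j-6b-hf \
    --qformat fp8 \
    --kv_cache_dtype fp8 \
    --output_dir ./gpt-j-fp8-kvfp8-ckpt
</pre>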
</section>
<section id="whats-coming-next">
<h2>What’s coming next</h2>
<p>TensorRT-LLM continues to improve its quantization features, such as public examples for Int4-FP8 AWQ (W4A8) and support for more models. Please stay tuned for our upcoming releases.</p>
</section>
</section>
</body>
</html>