mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-31 08:11:27 +08:00
1303 lines
75 KiB
HTML
1303 lines
75 KiB
HTML
<!DOCTYPE html>
|
||
<html class="writer-html5" lang="en" data-content_root="../">
|
||
<head>
|
||
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||
<title>Overview — tensorrt_llm documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=19f00094" />
|
||
|
||
|
||
<!--[if lt IE 9]>
|
||
<script src="../_static/js/html5shiv.min.js"></script>
|
||
<![endif]-->
|
||
|
||
<script src="../_static/jquery.js?v=5d32c60e"></script>
|
||
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
||
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
|
||
<script src="../_static/doctools.js?v=888ff710"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/js/theme.js"></script>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="Best Practices for Tuning the Performance of TensorRT-LLM" href="perf-best-practices.html" />
|
||
<link rel="prev" title="Expert Parallelism in TensorRT-LLM" href="../advanced/expert-parallelism.html" />
|
||
</head>
|
||
|
||
<body class="wy-body-for-nav">
|
||
<div class="wy-grid-for-nav">
|
||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||
<div class="wy-side-scroll">
|
||
<div class="wy-side-nav-search" >
|
||
|
||
|
||
|
||
<a href="../index.html" class="icon icon-home">
|
||
tensorrt_llm
|
||
</a>
|
||
<div role="search">
|
||
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
|
||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
||
<input type="hidden" name="check_keywords" value="yes" />
|
||
<input type="hidden" name="area" value="default" />
|
||
</form>
|
||
</div>
|
||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
||
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/batch-manager.html">The Batch Manager in TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1 current"><a class="current reference internal" href="#">Overview</a><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="#throughput-measurements">Throughput Measurements</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#reproducing-benchmarked-results">Reproducing Benchmarked Results</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#building-the-tensorrt-llm-container">Building the TensorRT-LLM Container</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#methodology">Methodology</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#commands">Commands</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#variables">Variables</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#engine-building">Engine Building</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#engine-configuration-files">Engine Configuration Files</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#network-configuration-files">Network Configuration Files</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#running-on-a100">Running on A100</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#preparing-a-dataset">Preparing a Dataset</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#running-the-benchmark">Running the Benchmark</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="perf-best-practices.html">Best Practices for Tuning the Performance of TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="perf-analysis.html">Performance Analysis</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Python API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</nav>
|
||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||
<a href="../index.html">tensorrt_llm</a>
|
||
</nav>
|
||
|
||
<div class="wy-nav-content">
|
||
<div class="rst-content">
|
||
<div role="navigation" aria-label="Page navigation">
|
||
<ul class="wy-breadcrumbs">
|
||
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
||
<li class="breadcrumb-item active">Overview</li>
|
||
<li class="wy-breadcrumbs-aside">
|
||
<a href="../_sources/performance/perf-overview.md.txt" rel="nofollow"> View page source</a>
|
||
</li>
|
||
</ul>
|
||
<hr/>
|
||
</div>
|
||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
||
<div itemprop="articleBody">
|
||
|
||
<blockquote id="perf-overview">
|
||
<div><p><strong>Important:</strong>
|
||
As of TensorRT-LLM v0.10, these performance benchmarks have changed methodology to utilize in-flight batching and
|
||
no longer utilize static benchmarking. These numbers are initial measurements and are expected to improve in future
|
||
releases.</p>
|
||
</div></blockquote>
|
||
<section id="overview">
|
||
<h1>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h1>
|
||
<p>This document summarizes performance measurements of TensorRT-LLM on H200 and H100
|
||
(Hopper), L40S (Ada) and A100 (Ampere) GPUs for a few key models.</p>
|
||
<p>The data in the following tables is provided as a reference point to help users
|
||
validate observed performance. It should not be considered as the peak
|
||
performance that can be delivered by TensorRT-LLM.</p>
|
||
<section id="throughput-measurements">
|
||
<h2>Throughput Measurements<a class="headerlink" href="#throughput-measurements" title="Link to this heading"></a></h2>
|
||
<p>The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
|
||
and shows the throughput of a client-server scenario under maximum load.</p>
|
||
<p>The performance numbers below were collected using the steps described in this document.</p>
|
||
<p><strong>All data in the table below was generated using version 0.10.0 and presents token throughput in tokens/second.</strong></p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p><strong>GPU</strong></p></td>
|
||
<td><p>H200 141GB HBM3</p></td>
|
||
<td><p>H100 80GB HBM3</p></td>
|
||
<td><p>H100 80GB HBM3</p></td>
|
||
<td><p>A100-SXM4-80GB</p></td>
|
||
<td><p>L40S</p></td>
|
||
<td><p>L40S</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p><strong>Precision</strong></p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP16</p></td>
|
||
<td><p>FP16</p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP16</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p><strong>Model</strong></p></td>
|
||
<td><p><strong>Input/Output Lengths</strong></p></td>
|
||
<td><p><strong>TP</strong></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>GPTJ 6B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>40633.96</p></td>
|
||
<td><p>34955.29</p></td>
|
||
<td><p>11206.68</p></td>
|
||
<td><p>5966.69</p></td>
|
||
<td><p>6997.91</p></td>
|
||
<td><p>3448.53</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>2937.91</p></td>
|
||
<td><p>2800.37</p></td>
|
||
<td><p>1354.56</p></td>
|
||
<td><p>682.27</p></td>
|
||
<td><p>747.43</p></td>
|
||
<td><p>352.4</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>9039.72</p></td>
|
||
<td><p>54939.48</p></td>
|
||
<td><p>3896.8</p></td>
|
||
<td><p>2225.09</p></td>
|
||
<td><p>2041.52</p></td>
|
||
<td><p>896.04</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>5437.97</p></td>
|
||
<td><p>3663.26</p></td>
|
||
<td><p>1498.04</p></td>
|
||
<td><p>882.61</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v2 7B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>18229.3</p></td>
|
||
<td><p>16985.6</p></td>
|
||
<td><p>10725.31</p></td>
|
||
<td><p>5303.5</p></td>
|
||
<td><p>6121.1</p></td>
|
||
<td><p>3139.62</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>2496.92</p></td>
|
||
<td><p>2355.47</p></td>
|
||
<td><p>1235.4</p></td>
|
||
<td><p>585.6</p></td>
|
||
<td><p>642.24</p></td>
|
||
<td><p>311.82</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>7612.25</p></td>
|
||
<td><p>6679.36</p></td>
|
||
<td><p>3399.43</p></td>
|
||
<td><p>1903.4</p></td>
|
||
<td><p>1749.4</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3259.74</p></td>
|
||
<td><p>2805.32</p></td>
|
||
<td><p>1335.51</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v3 8B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>16708.84</p></td>
|
||
<td><p>16708.53</p></td>
|
||
<td><p>12085.78</p></td>
|
||
<td><p>5853.96</p></td>
|
||
<td><p>8273.8</p></td>
|
||
<td><p>5207.01</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>2478.94</p></td>
|
||
<td><p>2427.09</p></td>
|
||
<td><p>1604.7</p></td>
|
||
<td><p>737.81</p></td>
|
||
<td><p>1021.64</p></td>
|
||
<td><p>622.15</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>8367.88</p></td>
|
||
<td><p>8013.55</p></td>
|
||
<td><p>6208.23</p></td>
|
||
<td><p>3385.71</p></td>
|
||
<td><p>4568.17</p></td>
|
||
<td><p>2134.72</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3674.33</p></td>
|
||
<td><p>3500.48</p></td>
|
||
<td><p>2776.31</p></td>
|
||
<td><p>1514.04</p></td>
|
||
<td><p>1546.84</p></td>
|
||
<td><p>899.2</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>Mixtral 8x7B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>16959.49</p></td>
|
||
<td><p>16051.88</p></td>
|
||
<td><p>12376.52</p></td>
|
||
<td><p>5120.41</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>5271.48</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>2423.99</p></td>
|
||
<td><p>2276.6</p></td>
|
||
<td><p>1717.37</p></td>
|
||
<td><p>636.5</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>654.36</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>12944.52</p></td>
|
||
<td><p>11997.24</p></td>
|
||
<td><p>7864.88</p></td>
|
||
<td><p>3946.92</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4650.16</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6208.97</p></td>
|
||
<td><p>5498.33</p></td>
|
||
<td><p>3722.56</p></td>
|
||
<td><p>1834.36</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2262.57</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v2 70B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>4055.97</p></td>
|
||
<td><p>2134.52</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6299.21</p></td>
|
||
<td><p>6035.36</p></td>
|
||
<td><p></p></td>
|
||
<td><p>963.14</p></td>
|
||
<td><p>980.31</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>8758.45</p></td>
|
||
<td><p>8148.67</p></td>
|
||
<td><p>5454.76</p></td>
|
||
<td><p>2394.12</p></td>
|
||
<td><p>1450.61</p></td>
|
||
<td><p>838.03</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>10261.44</p></td>
|
||
<td><p>9385.26</p></td>
|
||
<td><p>7491.94</p></td>
|
||
<td><p>3683.42</p></td>
|
||
<td><p>1387.91</p></td>
|
||
<td><p>1204.32</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>493.87</p></td>
|
||
<td><p>222.16</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>784.47</p></td>
|
||
<td><p>757.55</p></td>
|
||
<td><p></p></td>
|
||
<td><p>114.9</p></td>
|
||
<td><p>111.24</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>1164.15</p></td>
|
||
<td><p>1083.25</p></td>
|
||
<td><p>695.33</p></td>
|
||
<td><p>292.77</p></td>
|
||
<td><p>171.68</p></td>
|
||
<td><p>102.49</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>1441.26</p></td>
|
||
<td><p>1346.9</p></td>
|
||
<td><p>1016.58</p></td>
|
||
<td><p>456.46</p></td>
|
||
<td><p>163.76</p></td>
|
||
<td><p>145.41</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3199.9</p></td>
|
||
<td><p>635.32</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6747</p></td>
|
||
<td><p>4710.45</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>10960.72</p></td>
|
||
<td><p>8485.56</p></td>
|
||
<td><p>3686.63</p></td>
|
||
<td><p>2047.67</p></td>
|
||
<td><p>1368.09</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>17250.73</p></td>
|
||
<td><p>12333.24</p></td>
|
||
<td><p>7927.16</p></td>
|
||
<td><p>4166.36</p></td>
|
||
<td><p>1667.57</p></td>
|
||
<td><p>1186.38</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>1734.58</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>3455.34</p></td>
|
||
<td><p>2267.45</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>6141.39</p></td>
|
||
<td><p>4019.31</p></td>
|
||
<td><p>1814.78</p></td>
|
||
<td><p>1046</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>9271.77</p></td>
|
||
<td><p>7061.32</p></td>
|
||
<td><p>3658.42</p></td>
|
||
<td><p>2210.84</p></td>
|
||
<td><p>771.23</p></td>
|
||
<td><p>614.74</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v3 70B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3988.96</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6155.26</p></td>
|
||
<td><p>5835.57</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>8454.74</p></td>
|
||
<td><p>7945.64</p></td>
|
||
<td><p>5210.19</p></td>
|
||
<td><p>2405.44</p></td>
|
||
<td><p>1280.9</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>9893.18</p></td>
|
||
<td><p>9308.51</p></td>
|
||
<td><p>7126.51</p></td>
|
||
<td><p>3621.25</p></td>
|
||
<td><p>1367.56</p></td>
|
||
<td><p>1164.88</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>491.79</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>783.26</p></td>
|
||
<td><p>751.14</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>1154.66</p></td>
|
||
<td><p>1074.31</p></td>
|
||
<td><p>691.99</p></td>
|
||
<td><p>295.87</p></td>
|
||
<td><p>171.16</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>1434.86</p></td>
|
||
<td><p>1337.36</p></td>
|
||
<td><p>1010.5</p></td>
|
||
<td><p>455.18</p></td>
|
||
<td><p>165.06</p></td>
|
||
<td><p>143.92</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3015.16</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6758.32</p></td>
|
||
<td><p>4130.4</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>10532.1</p></td>
|
||
<td><p>7730.54</p></td>
|
||
<td><p>3246.34</p></td>
|
||
<td><p>1974.04</p></td>
|
||
<td><p>1232.53</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>16467.79</p></td>
|
||
<td><p>11680.94</p></td>
|
||
<td><p>7205.34</p></td>
|
||
<td><p>4091.45</p></td>
|
||
<td><p>1514.93</p></td>
|
||
<td><p>1034.07</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>1654.25</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>3271.6</p></td>
|
||
<td><p>1976.76</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>6113.93</p></td>
|
||
<td><p>3685.74</p></td>
|
||
<td><p>1612.11</p></td>
|
||
<td><p>992.74</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>8986.3</p></td>
|
||
<td><p>6443.85</p></td>
|
||
<td><p>3523.17</p></td>
|
||
<td><p>2118.89</p></td>
|
||
<td><p>691.62</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>Falcon 180B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>3810.55</p></td>
|
||
<td><p>3698.71</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>5946.89</p></td>
|
||
<td><p>5608.59</p></td>
|
||
<td><p>3954.58</p></td>
|
||
<td><p>1754.14</p></td>
|
||
<td><p>1243.33</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>525.6</p></td>
|
||
<td><p>510.85</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>848.4</p></td>
|
||
<td><p>813.95</p></td>
|
||
<td><p>535.41</p></td>
|
||
<td><p>221.39</p></td>
|
||
<td><p>145.35</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>2883.67</p></td>
|
||
<td><p>2495.62</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>5388.34</p></td>
|
||
<td><p>4796.47</p></td>
|
||
<td><p>3051.89</p></td>
|
||
<td><p>1684.6</p></td>
|
||
<td><p>1359.42</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>1376.61</p></td>
|
||
<td><p>952.25</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>2495.66</p></td>
|
||
<td><p>2421.77</p></td>
|
||
<td><p>896.28</p></td>
|
||
<td><p></p></td>
|
||
<td><p>609.65</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><em>TP stands for Tensor Parallelism</em></p>
|
||
</section>
|
||
<section id="reproducing-benchmarked-results">
|
||
<h2>Reproducing Benchmarked Results<a class="headerlink" href="#reproducing-benchmarked-results" title="Link to this heading"></a></h2>
|
||
<section id="building-the-tensorrt-llm-container">
|
||
<h3>Building the TensorRT-LLM Container<a class="headerlink" href="#building-the-tensorrt-llm-container" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>In order to benchmark TensorRT-LLM, you will need to follow the <a class="reference internal" href="../quick-start-guide.html">Quick Start</a>
|
||
build process to create a baseline container for building a wheel. Additionally, the development
|
||
container needs a copy of the source code to build the wheel and the benchmarking script. To create the
|
||
right build environment, use the following commands:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/NVIDIA/TensorRT-LLM.git
|
||
<span class="nb">cd</span><span class="w"> </span>TensorRT-LLM
|
||
git<span class="w"> </span>submodule<span class="w"> </span>update<span class="w"> </span>--init<span class="w"> </span>--recursive
|
||
git<span class="w"> </span>lfs<span class="w"> </span>install
|
||
git<span class="w"> </span>lfs<span class="w"> </span>pull
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>build
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span>
|
||
</pre></div>
|
||
</div>
|
||
<blockquote>
|
||
<div><p><strong>Warning:</strong>
|
||
If you have elevated privileges on your system, then skip the <code class="docutils literal notranslate"><span class="pre">make</span> <span class="pre">-C</span> <span class="pre">docker</span> <span class="pre">run</span> <span class="pre">LOCAL_USER=1</span></code>
|
||
command above, as it may prevent you from accessing some required system libraries within the
|
||
container because the build forces your UID and GID to match those that are set for your non-elevated
|
||
user. There are cases where the container will be booted as root (e.g., on some SLURM systems with
|
||
the pyxis plugin) which will cause libraries to be missing.</p>
|
||
</div></blockquote>
|
||
<p>If you are benchmarking in a shared environment, you need to specify the GPU indices that you would
|
||
like the container to use, otherwise the Makefile defaults to loading the container with all GPUs on
|
||
the system. For example, if you only have access to the first 4 GPUs on your system (indices 0 through 3), you can
|
||
configure it using the following example:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">NV_GPU</span><span class="o">=</span><span class="m">0</span>,1,2,3
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">GPU_OPTS</span><span class="o">=</span><span class="s1">'--gpus \"device=${NV_GPU}\"'</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Additionally, if you’d like to mount external directories to access persistent storage or previously
|
||
built engines, you can mount directories as follows (simply replace <code class="docutils literal notranslate"><span class="pre">source</span></code> and <code class="docutils literal notranslate"><span class="pre">destination</span></code> with
|
||
the appropriate paths):</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">DOCKER_RUN_ARGS</span><span class="o">=</span><span class="s2">"-v /source:/destination"</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Once the container starts, you’ll need to build the wheel and the benchmarking scripts. From the
|
||
code root (the default directory when the container is loaded), the following commands will build
|
||
the TensorRT-LLM wheel, install dependencies, and build the benchmark scripts:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>python3<span class="w"> </span>./scripts/build_wheel.py<span class="w"> </span>--benchmarks<span class="w"> </span>--trt_root<span class="w"> </span>/usr/local/tensorrt
|
||
pip<span class="w"> </span>install<span class="w"> </span>./build/tensorrt_llm*.whl
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="methodology">
|
||
<h2>Methodology<a class="headerlink" href="#methodology" title="Link to this heading"></a></h2>
|
||
<p>The following tables are references for commands that are used as part of the benchmarking process.</p>
|
||
<section id="commands">
|
||
<h3>Commands<a class="headerlink" href="#commands" title="Link to this heading"></a></h3>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head text-left"><p>Stage</p></th>
|
||
<th class="head"><p>Description</p></th>
|
||
<th class="head"><p>Command</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#engine-building"><span class="xref myst">Build</span></a></p></td>
|
||
<td><p>Build a TensorRT-LLM engine</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-build</span> <span class="pre">--model_config</span> <span class="pre">$model_cfg</span> <span class="pre">--strongly_typed</span> <span class="pre">--output_dir</span> <span class="pre">$engine_dir</span> <span class="pre">--max_batch_size</span> <span class="pre">2048</span> <span class="pre">--max_input_len</span> <span class="pre">2048</span> <span class="pre">--max_output_len</span> <span class="pre">4096</span> <span class="pre">--workers</span> <span class="pre">$tp_size</span> <span class="pre">--max_num_tokens</span> <span class="pre">2048</span> <span class="pre">--use_paged_context_fmha</span> <span class="pre">enable</span> <span class="pre">--multiple_profiles</span> <span class="pre">enable</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset"><span class="xref myst">Dataset</span></a></p></td>
|
||
<td><p>Create a synthetic dataset</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--output=$dataset_file</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=2000</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#running-the-benchmark"><span class="xref myst">Run</span></a></p></td>
|
||
<td><p>Run a benchmark with a dataset</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">mpirun</span> <span class="pre">-n</span> <span class="pre">$tp_size</span> <span class="pre">--allow-run-as-root</span> <span class="pre">--oversubscribe</span> <span class="pre">cpp/build/benchmarks/gptManagerBenchmark</span> <span class="pre">--engine_dir</span> <span class="pre">$engine_dir</span> <span class="pre">--type</span> <span class="pre">IFB</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span> <span class="pre">--scheduler_policy</span> <span class="pre">max_utilization</span> <span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.9</span> <span class="pre">--output_csv</span> <span class="pre">$results_csv</span> <span class="pre">--request_rate</span> <span class="pre">-1.0</span> <span class="pre">--enable_chunked_context</span> <span class="pre">--streaming</span> <span class="pre">--warm_up</span> <span class="pre">0</span></code></p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="variables">
|
||
<h3>Variables<a class="headerlink" href="#variables" title="Link to this heading"></a></h3>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head text-left"><p>Name</p></th>
|
||
<th class="head"><p>Description</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$isl</span></code></p></td>
|
||
<td><p>Benchmark input sequence length.</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$osl</span></code></p></td>
|
||
<td><p>Benchmark output sequence length.</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></td>
|
||
<td><p>Number of GPUs to run the benchmark with</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$engine_dir</span></code></p></td>
|
||
<td><p>Location to store built engine file (can be deleted after running benchmarks).</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_cfg</span></code></p></td>
|
||
<td><p>Name of the model configuration JSON file to use for building.</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_name</span></code></p></td>
|
||
<td><p>Hugging Face model name (e.g., meta-llama/Llama-2-7b-hf) or the path to a local weights directory.</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code></p></td>
|
||
<td><p>Location of the dataset file generated by <code class="docutils literal notranslate"><span class="pre">prepare_dataset.py</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$results_csv</span></code></p></td>
|
||
<td><p>Path to store end results to.</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="engine-building">
|
||
<h3>Engine Building<a class="headerlink" href="#engine-building" title="Link to this heading"></a></h3>
|
||
<p>All benchmarks were run using a single engine with a configuration that is capable of handling the
|
||
maximum sequence lengths encountered during benchmarking. For each benchmark, regardless of input/output
|
||
sequence length, you can reuse the single engine to run all tests. Each engine will be built with a paged
|
||
KV cache and in-flight batching enabled. For more information see the
|
||
<a class="reference internal" href="../overview.html#in-flight-batching-and-paged-attention"><span class="std std-ref">documentation about in-flight batching</span></a>.</p>
|
||
<p>In order to build an engine you will need to run the following command by specifying a configuration file
|
||
for the model that you would like to build (see <a class="reference internal" href="#network-configuration-files"><span class="xref myst">below</span></a>). The general build
|
||
command is as follows:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>trtllm-build<span class="w"> </span>--model_config<span class="w"> </span><span class="nv">$model_cfg</span><span class="w"> </span>--strongly_typed<span class="w"> </span>--output_dir<span class="w"> </span><span class="nv">$engine_dir</span><span class="w"> </span>--max_batch_size<span class="w"> </span><span class="m">2048</span><span class="w"> </span>--max_input_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span>--max_output_len<span class="w"> </span><span class="m">4096</span><span class="w"> </span>--workers<span class="w"> </span><span class="nv">$tp_size</span><span class="w"> </span>--max_num_tokens<span class="w"> </span><span class="m">2048</span><span class="w"> </span>--use_paged_context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span>--multiple_profiles<span class="w"> </span><span class="nb">enable</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Some notes about the command:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--workers</span></code> affects the number of threads that build the engine file and does not necessarily need to match
|
||
the TP size. Make sure to set the tensor parallelism in the <code class="docutils literal notranslate"><span class="pre">$model_cfg</span></code> JSON file. See <a class="reference internal" href="#network-configuration-files"><span class="xref myst">below</span></a>.</p></li>
|
||
<li><p>You can run benchmarks for datasets that fit within the bounds of the <code class="docutils literal notranslate"><span class="pre">max_input_len</span></code> and <code class="docutils literal notranslate"><span class="pre">max_output_len</span></code> parameters.</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="engine-configuration-files">
|
||
<h3>Engine Configuration Files<a class="headerlink" href="#engine-configuration-files" title="Link to this heading"></a></h3>
|
||
<p>In order to configure the TensorRT-LLM build process for benchmarking, you need to provide
|
||
<code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> a configuration file that specifies the network configuration, parallelism
|
||
mapping, and quantization options.</p>
|
||
<p>Below we document how to benchmark each model on an H100-HBM3-80GB system and reproduce the throughput
|
||
numbers we document in our <a class="reference internal" href="#performance-of-tensorrt-llm">Performance section</a>.</p>
|
||
<blockquote>
|
||
<div><p><strong>Important:</strong>
|
||
In order to change the parallelism for a build, you need to modify the <code class="docutils literal notranslate"><span class="pre">mapping</span></code> dictionary in your configuration file. The settings
|
||
must conform to the following condition: <code class="docutils literal notranslate"><span class="pre">world_size</span> <span class="pre">==</span> <span class="pre">tp_size</span> <span class="pre">*</span> <span class="pre">pp_size</span></code>.</p>
|
||
</div></blockquote>
|
||
<blockquote>
|
||
<div><p><strong>Note:</strong>
|
||
All configurations below are set to run utilizing FP8 by default. If you would like to run on an A100 system, see our notes about <a class="reference internal" href="#running-on-a100"><span class="xref myst">disabling FP8 quantization</span></a>.</p>
|
||
</div></blockquote>
|
||
</section>
|
||
<section id="network-configuration-files">
|
||
<h3>Network Configuration Files<a class="headerlink" href="#network-configuration-files" title="Link to this heading"></a></h3>
|
||
<p>Each network has its own configuration file. All networks are configured to run using FP8 quantization
|
||
by default.</p>
|
||
<table>
|
||
<tr>
|
||
<td> Model </td> <td> Configuration File (FP8) </td>
|
||
</tr>
|
||
<tr>
|
||
<td> EleutherAI/gpt-j-6b </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"GPTJForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">28</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">50400</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gptj"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu_new"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"rotary_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> tiiuae/falcon-180B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FalconForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">232</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14848</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">65024</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"use_parallel_embedding"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"embedding_sharding_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"share_embedding_table"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"bias"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"parallel_attention"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"new_decoder_architecture"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Llama-2-7b-hf </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">11008</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Llama-2-70b-hf </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Meta-Llama-3-8B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128256</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Meta-Llama-3-70B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128256</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> mistralai/Mixtral-8x7B-v0.1 </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"MixtralForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">32768</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"head_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"swiglu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14336</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"moe_num_experts"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"moe_top_k"</span><span class="p">:</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">1000000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">1000000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
</section>
|
||
<section id="running-on-a100">
|
||
<h3>Running on A100<a class="headerlink" href="#running-on-a100" title="Link to this heading"></a></h3>
|
||
<p>To run the benchmarks on A100, you will need to undefine or remove the following
|
||
quantization fields from each config JSON file, because FP8 computation is only supported on H100 and newer GPUs.</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="preparing-a-dataset">
|
||
<h2>Preparing a Dataset<a class="headerlink" href="#preparing-a-dataset" title="Link to this heading"></a></h2>
|
||
<p>In order to prepare a dataset, you can use the provided <a class="reference download internal" download="" href="../_downloads/ea8faa5e98124e92f96b66dc586fb429/prepare_dataset.py"><span class="xref download myst">script</span></a>.
|
||
To generate a synthetic dataset, run the following command:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>benchmarks/cpp/prepare_dataset.py<span class="w"> </span>--output<span class="o">=</span><span class="nv">$dataset_file</span><span class="w"> </span>--tokenizer<span class="o">=</span><span class="nv">$model_name</span><span class="w"> </span>token-norm-dist<span class="w"> </span>--num-requests<span class="o">=</span><span class="m">2000</span><span class="w"> </span>--input-mean<span class="o">=</span><span class="nv">$isl</span><span class="w"> </span>--output-mean<span class="o">=</span><span class="nv">$osl</span><span class="w"> </span>--input-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>--output-stdev<span class="o">=</span><span class="m">0</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The command will generate a JSON file located at the path specified by <code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code>, where all requests are of the same
|
||
input/output sequence length combination. The script works by using the tokenizer to retrieve the vocabulary size and
|
||
randomly sample token IDs from it to create entirely random sequences. In the command above, all requests will be uniform
|
||
because the standard deviations for both input and output sequences are set to 0.</p>
|
||
</section>
|
||
<section id="running-the-benchmark">
|
||
<h2>Running the Benchmark<a class="headerlink" href="#running-the-benchmark" title="Link to this heading"></a></h2>
|
||
<p>To run the benchmark with the generated dataset, run the following command from the root of the
|
||
TensorRT-LLM repository. See the <a class="reference internal" href="#variables"><span class="xref myst">variables</span></a> section for reference on variable values.</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="nv">$tp_size</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>cpp/build/benchmarks/gptManagerBenchmark<span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_dir</span><span class="w"> </span>--type<span class="w"> </span>IFB<span class="w"> </span>--dataset<span class="w"> </span><span class="nv">$dataset_file</span><span class="w"> </span>--scheduler_policy<span class="w"> </span>max_utilization<span class="w"> </span>--kv_cache_free_gpu_mem_fraction<span class="w"> </span><span class="m">0</span>.9<span class="w"> </span>--output_csv<span class="w"> </span><span class="nv">$results_csv</span><span class="w"> </span>--request_rate<span class="w"> </span>-1.0<span class="w"> </span>--enable_chunked_context<span class="w"> </span>--streaming<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">0</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The command will run the <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> binary that will report the throughput and other metrics as part of its output
|
||
that can be compared with the table in the <a class="reference internal" href="#peak-throughput"><span class="xref myst">Performance section</span></a> of this document.</p>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||
<a href="../advanced/expert-parallelism.html" class="btn btn-neutral float-left" title="Expert Parallelism in TensorRT-LLM" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||
<a href="perf-best-practices.html" class="btn btn-neutral float-right" title="Best Practices for Tuning the Performance of TensorRT-LLM" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||
</div>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
<!-- Removed a stray template object repr that was rendered here instead of the parent footer block. -->
|
||
|
||
<div class="footer">
|
||
<p>
|
||
Copyright © 2024 NVIDIA Corporation
|
||
</p>
|
||
<p>
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Privacy Policy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Manage My Privacy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Accessibility</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Product Security</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Contact</a>
|
||
</p>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |