> [!IMPORTANT]
> As of TensorRT-LLM v0.10, these performance benchmarks use in-flight batching and no
> longer rely on static benchmarking. These numbers are initial measurements and are
> expected to improve in future releases.
<section id="overview">
|
|
<h1>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h1>
|
|
<p>This document summarizes performance measurements of TensorRT-LLM on H100
|
|
(Hopper), L40S (Ada) and A100 (Ampere) GPUs for a few key models.</p>
|
|
<p>The data in the following tables is provided as a reference point to help users
|
|
validate observed performance. It should not be considered as the peak
|
|
performance that can be delivered by TensorRT-LLM.</p>
|
|
<section id="known-issues">
|
|
<h2>Known Issues<a class="headerlink" href="#known-issues" title="Link to this heading"></a></h2>
|
|
<p>The following issues are being addressed to improve the efficiency of TensorRT-LLM.</p>
|
|
<section id="unexpected-extra-gpu-memory-allocation-when-enabling-multiple-profiles">
|
|
<h3>Unexpected extra GPU memory allocation when enabling <code class="docutils literal notranslate"><span class="pre">--multiple_profiles</span></code><a class="headerlink" href="#unexpected-extra-gpu-memory-allocation-when-enabling-multiple-profiles" title="Link to this heading"></a></h3>
|
|
<p>We observed that enabling multiple profiles can lead to extra
|
|
unexpected GPU memory usage on some cases starting from v0.11.
|
|
The issue will be addressed in future releases.</p>
|
|
</section>
|
|
<section id="fused-matmul-gated-silu-llama">
|
|
<h3>Fused Matmul + Gated-SiLU (LLaMA)<a class="headerlink" href="#fused-matmul-gated-silu-llama" title="Link to this heading"></a></h3>
|
|
<p>The current implementation combines two Matmul operations into one Matmul followed by
|
|
a separate SwiGLU kernel (when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp</span></code> is enabled). There is also a more
|
|
efficient implementation that runs single Matmul + SwiGLU fused kernel for FP8 on Hopper
|
|
(when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp</span> <span class="pre">--gemm_swiglu_plugin</span> <span class="pre">fp8</span></code> is enabled). The gemm_swiglu_plugin
|
|
will support more data types and GPU architectures in the future release.</p>
|
|
</section>
|
|
</section>
|
|
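For reference, a minimal sketch of how these two variants would be selected at engine build time, assuming a `trtllm-build` flow with an already converted checkpoint; `$checkpoint_dir` and `$engine_dir` are illustrative placeholders rather than values from this page:

```shell
# Sketch only: fused MLP, with the Matmul and SwiGLU still running as two kernels
trtllm-build --checkpoint_dir $checkpoint_dir \
             --output_dir $engine_dir \
             --use_fused_mlp

# Sketch only: single fused Matmul + SwiGLU FP8 kernel (Hopper only)
trtllm-build --checkpoint_dir $checkpoint_dir \
             --output_dir $engine_dir \
             --use_fused_mlp \
             --gemm_swiglu_plugin fp8
```

The second invocation only applies to FP8 on Hopper, as noted above.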
<section id="throughput-measurements">
|
|
<h2>Throughput Measurements<a class="headerlink" href="#throughput-measurements" title="Link to this heading"></a></h2>
|
|
<p>The below table shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
|
|
and shows the throughput client-server scenario under maximum load.</p>
|
|
<p>The performance numbers below were collected using the steps described in this document.</p>
|
|
<p><strong>All data in the table below was generated using version 0.12.0 and presents token throughput in tokens/second.</strong></p>
|
|
| Model | Input/Output Lengths | TP | H200 141GB HBM3 (FP8) | GH200 120GB (FP8) | H100 80GB HBM3 (FP8) | H100 80GB HBM3 (Mixed) | A100-SXM4-80GB (Mixed) | L40S (FP8) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| GPTJ 6B | 128/128 | 1 | 24834.76 | 22454.79 | 24429.55 | 13085.91 | 5864.81 | 7647.24 |
| | 128/2048 | 1 | 8348.93 | 6656.25 | 7831.38 | 3882.21 | 2194.57 | 1843.91 |
| | 128/4096 | 1 | 5062.80 | 3678.91 | 3968.98 | 2046.53 | 1118.22 | 980.67 |
| | 2048/128 | 1 | 2776.53 | 2491.03 | 2724.38 | 1488.56 | 657.01 | 741.06 |
| | 2048/2048 | 1 | 3631.54 | 2994.81 | 3004.17 | 1280.54 | 854.37 | 754.16 |
| LLaMA v2 7B | 128/128 | 1 | 19706.35 | 17803.58 | 19068.99 | 11393.48 | 5272.39 | 6345.72 |
| | 128/2048 | 1 | 7651.12 | 5472.34 | 6610.03 | 2964.65 | 1785.79 | 1551.37 |
| | 128/4096 | 1 | 4424.90 | 3271.61 | 3649.38 | 1596.87 | 957.12 | 817.24 |
| | 2048/128 | 1 | 2385.54 | 2035.42 | 2271.63 | 1189.06 | 564.77 | 625.09 |
| | 2048/2048 | 1 | 3191.34 | 2726.29 | 2802.41 | 1243.96 | 735.19 | 641.56 |
| LLaMA v3 8B | 128/128 | 1 | 28288.75 | 25420.52 | 27399.75 | 15567.44 | 6586.88 | 8745.80 |
| | 128/2048 | 1 | 23230.62 | 16426.68 | 19198.73 | 8817.39 | 4882.13 | 5084.49 |
| | 128/4096 | 1 | 16144.44 | 9832.66 | 12084.97 | 5352.37 | 3079.90 | 2755.13 |
| | 2048/128 | 1 | 3623.79 | 3290.22 | 3463.26 | 1852.48 | 781.63 | 980.86 |
| | 2048/2048 | 1 | 11093.62 | 7573.35 | 8894.11 | 3986.83 | 2268.13 | 2051.79 |
| Mistral 7B | 128/128 | 1 | 30223.01 | 27696.90 | 29788.46 | 16319.25 | 6807.02 | 9612.58 |
| | 128/2048 | 1 | 24989.54 | 17942.29 | 20509.72 | 9982.01 | 5296.02 | 5444.89 |
| | 128/4096 | 1 | 17036.14 | 10846.03 | 12807.80 | 5718.89 | 3241.33 | 2931.17 |
| | 2048/128 | 1 | 3678.80 | 3294.02 | 3521.71 | 1887.75 | 786.43 | 1002.49 |
| | 2048/2048 | 1 | 11510.54 | 8357.75 | 9214.61 | 4284.82 | 2363.25 | 2154.26 |
| Mixtral 8x7B | 128/128 | 2 | 24895.03 | 8785.80 | 24394.71 | 15529.86 | 5921.41 | |
| | | 4 | 42014.24 | 38828.53 | 40197.42 | 28132.17 | 11414.95 | 6820.26 |
| | 128/2048 | 2 | 29389.21 | 5474.69 | 20873.02 | 7066.02 | 4306.98 | |
| | | 4 | 52348.10 | 41573.66 | 40588.05 | 21285.72 | 10974.83 | 7467.15 |
| | 128/4096 | 2 | 21480.27 | 2277.66 | 12838.28 | 3986.01 | 2400.11 | |
| | | 4 | 39182.04 | 28626.55 | 28337.31 | 12447.13 | 7278.89 | 5233.43 |
| | 2048/128 | 2 | 2934.44 | 1003.51 | 2898.27 | 1834.77 | 693.51 | |
| | | 4 | 5152.40 | 4724.01 | 5028.61 | 3393.18 | 1362.93 | 805.49 |
| | 2048/2048 | 2 | 14029.17 | 2671.88 | 10479.45 | 3531.31 | 1945.88 | |
| | | 4 | 25436.05 | 20302.56 | 19971.72 | 9622.66 | 5221.74 | 3616.30 |
| LLaMA v3 70B | 128/128 | 2 | 5386.88 | | | 2959.22 | 1301.14 | |
| | | 4 | 8944.26 | 8587.01 | 8642.05 | 5966.47 | 2413.95 | |
| | | 8 | 16125.20 | | 15397.47 | 10406.55 | 4548.32 | 1364.08 |
| | 128/2048 | 2 | 7007.27 | | | 720.73 | 500.83 | |
| | | 4 | 12906.75 | 10761.53 | 8978.95 | 4736.61 | 2380.02 | |
| | | 8 | 19417.37 | | 14822.93 | 6672.14 | 3815.08 | 1809.40 |
| | 128/4096 | 2 | 6183.85 | | | 369.29 | 251.24 | |
| | | 4 | 8859.54 | 7270.77 | 6073.48 | 2969.99 | 1634.82 | |
| | | 8 | 13969.95 | | 10094.57 | 4358.77 | 2847.54 | 1313.78 |
| | 2048/128 | 2 | 696.59 | | | 301.46 | 140.88 | |
| | | 4 | 1044.35 | 1000.55 | 1022.06 | 681.72 | 278.76 | |
| | | 8 | 2018.47 | | 1933.15 | 1279.46 | 543.73 | 163.36 |
| | 2048/2048 | 2 | 3525.18 | | | | 87.54 | |
| | | 4 | 6550.76 | 4859.38 | 4870.26 | 2379.66 | 1209.69 | |
| | | 8 | 9706.95 | | 7670.04 | 3692.41 | 2192.28 | 895.23 |
| LLaMA v2 70B | 128/128 | 2 | 6355.16 | | | 2927.71 | 1374.05 | |
| | | 4 | 10818.97 | 10819.19 | 10754.99 | 6603.10 | 2765.94 | |
| | | 8 | 16667.25 | | 16074.84 | 11369.11 | 4796.89 | 1402.92 |
| | 128/2048 | 2 | 6185.77 | | | 668.52 | 445.04 | |
| | | 4 | 12884.76 | 11356.48 | 8870.71 | 5067.06 | 2710.53 | |
| | | 8 | 19053.13 | | 17534.62 | 8805.16 | 5665.93 | 2203.33 |
| | 128/4096 | 2 | 4873.24 | | | 334.10 | 215.70 | |
| | | 4 | 8664.90 | 6311.85 | 7564.99 | 3354.02 | 1884.46 | |
| | | 8 | 15110.32 | | 10584.03 | 5373.10 | 3672.80 | 1787.76 |
| | 2048/128 | 2 | 732.09 | | | 302.49 | 141.70 | |
| | | 4 | 1272.90 | 1269.58 | 1265.80 | 774.93 | 320.79 | |
| | | 8 | 2015.77 | | 1943.96 | 1355.78 | 569.48 | 165.52 |
| | 2048/2048 | 2 | 3508.50 | | | 321.95 | 212.97 | |
| | | 4 | 6642.69 | 5545.83 | 4889.26 | 2439.10 | 1276.58 | |
| | | 8 | 10178.71 | | 8071.77 | 4275.74 | 2589.60 | 1083.45 |
| Falcon 180B | 128/128 | 4 | 5129.55 | | | | | |
| | | 8 | 8370.98 | | 8268.72 | | | |
| | 128/2048 | 4 | 7823.79 | | | | | |
| | | 8 | 13278.59 | | 13107.48 | | | |
| | 128/4096 | 4 | 6374.10 | | | | | |
| | | 8 | 12660.89 | | 10493.79 | | | |
| | 2048/128 | 4 | 601.67 | | | | | |
| | | 8 | 1002.57 | | 991.22 | | | |
| | 2048/2048 | 4 | 3869.76 | | | | | |
| | | 8 | 7134.33 | | 6386.83 | | | |

*TP stands for Tensor Parallelism*
<section id="reproducing-benchmarked-results">
|
|
<h2>Reproducing Benchmarked Results<a class="headerlink" href="#reproducing-benchmarked-results" title="Link to this heading"></a></h2>
|
|
<blockquote>
|
|
<div><p>[!NOTE] The only models supported in this workflow are those listed in the table above.</p>
|
|
</div></blockquote>
|
|
<p>The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
|
|
description of this benchmarking workflow, see the <span class="xref myst">Benchmarking Suite README</span>.</p>
|
|
<section id="commands">
|
|
<h3>Commands<a class="headerlink" href="#commands" title="Link to this heading"></a></h3>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head text-left"><p>Stage</p></th>
|
|
<th class="head"><p>Description</p></th>
|
|
<th class="head"><p>Command</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset"><span class="xref myst">Dataset</span></a></p></td>
|
|
<td><p>Create a synthetic dataset</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">python</span> <span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">--stdout</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=$num_requests</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span> <span class="pre">></span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#engine-building"><span class="xref myst">Build</span></a></p></td>
|
|
<td><p>Build a TensorRT-LLM engine</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">--model</span> <span class="pre">$model_name</span> <span class="pre">build</span> <span class="pre">--tp_size</span> <span class="pre">$tp_size</span> <span class="pre">--quantization</span> <span class="pre">FP8</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#running-the-benchmark"><span class="xref myst">Run</span></a></p></td>
|
|
<td><p>Run a benchmark with a dataset</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">--model</span> <span class="pre">$model_name</span> <span class="pre">throughput</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span> <span class="pre">--engine_dir</span> <span class="pre">$engine_dir</span></code></p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
<section id="variables">
|
|
<h3>Variables<a class="headerlink" href="#variables" title="Link to this heading"></a></h3>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head text-left"><p>Name</p></th>
|
|
<th class="head"><p>Description</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$isl</span></code></p></td>
|
|
<td><p>Benchmark input sequence length.</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$osl</span></code></p></td>
|
|
<td><p>Benchmark output sequence length.</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></td>
|
|
<td><p>Number of GPUs to run the benchmark with</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$engine_dir</span></code></p></td>
|
|
<td><p>Location to store built engine file (can be deleted after running benchmarks).</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_name</span></code></p></td>
|
|
<td><p>HuggingFace model name eg. meta-llama/Llama-2-7b-hf or use the path to a local weights directory</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code></p></td>
|
|
<td><p>Location of the dataset file generated by <code class="docutils literal notranslate"><span class="pre">prepare_dataset.py</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$num_requests</span></code></p></td>
|
|
<td><p>The number of requests to generate for dataset generation</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$seq_len</span></code></p></td>
|
|
<td><p>A sequence length of ISL + OSL</p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
</section>
|
|
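As a concrete illustration, the variables for the Llama-2-7b 128/128 entry in the tables above could be set as follows; the file paths are placeholders rather than values taken from this page:

```shell
# Values from the tables in this document; paths are illustrative placeholders
model_name="meta-llama/Llama-2-7b-hf"
isl=128
osl=128
seq_len=$((isl + osl))     # 256
num_requests=30000         # see the dataset-size table below for other ISL/OSL combinations
tp_size=1
dataset_file=/tmp/llama-2-7b_128_128.txt
engine_dir=/tmp/llama-2-7b_engine
```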
<section id="preparing-a-dataset">
|
|
<h2>Preparing a Dataset<a class="headerlink" href="#preparing-a-dataset" title="Link to this heading"></a></h2>
|
|
<p>In order to prepare a dataset, you can use the provided <a class="reference download internal" download="" href="../_downloads/ea8faa5e98124e92f96b66dc586fb429/prepare_dataset.py"><span class="xref download myst">script</span></a>.
|
|
To generate a synthetic dataset, run the following command:</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>benchmarks/cpp/prepare_dataset.py<span class="w"> </span>--output<span class="o">=</span><span class="nv">$dataset_file</span><span class="w"> </span>--tokenizer<span class="o">=</span><span class="nv">$model_name</span><span class="w"> </span>token-norm-dist<span class="w"> </span>--num-requests<span class="o">=</span><span class="nv">$num_requests</span><span class="w"> </span>--input-mean<span class="o">=</span><span class="nv">$isl</span><span class="w"> </span>--output-mean<span class="o">=</span><span class="nv">$osl</span><span class="w"> </span>--input-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>--output-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>><span class="w"> </span><span class="nv">$dataset_file</span>
|
|
</pre></div>
|
|
</div>
|
|
The command generates a text file at the path specified by `$dataset_file` in which all requests have the same input/output sequence lengths. The script works by using the tokenizer to retrieve the vocabulary size and randomly sampling token IDs from it to create entirely random sequences. In the command above, all requests are uniform because the standard deviations for both input and output sequences are set to 0.

For each input and output sequence length combination, the table below details the `$num_requests` that were used. For shorter input and output lengths, a larger number of requests was used to guarantee that the system reached a steady state, because requests enter and exit the system at a much faster rate. For longer input/output sequence lengths, requests remain in the system longer and therefore fewer requests are needed to reach steady state.
| Input Length | Output Length | $seq_len | $num_requests |
| - | - | - | - |
| 128 | 128 | 256 | 30000 |
| 128 | 2048 | 2176 | 3000 |
| 128 | 4096 | 4224 | 1500 |
| 2048 | 128 | 2176 | 3000 |
| 2048 | 2048 | 4096 | 1500 |
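For instance, plugging the 128/128 row into the command above for Llama-2-7b yields the following invocation; the output path is an illustrative placeholder:

```shell
# 30000 uniform 128/128 requests for meta-llama/Llama-2-7b-hf
python benchmarks/cpp/prepare_dataset.py --tokenizer=meta-llama/Llama-2-7b-hf --stdout \
    token-norm-dist --num-requests=30000 --input-mean=128 --output-mean=128 \
    --input-stdev=0 --output-stdev=0 > /tmp/llama-2-7b_128_128.txt
```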
<section id="engine-building">
|
|
<h2>Engine Building<a class="headerlink" href="#engine-building" title="Link to this heading"></a></h2>
|
|
<p>All engines are built using the <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">build</span></code> sub-command. The basic command for FP8 quantized engines is as follows:</p>
|
|
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>trtllm-bench --model $model_name build --tp_size $tp_size --quantization FP8 --dataset $dataset_file
|
|
</pre></div>
|
|
</div>
|
|
<p>or if you would like to build for a specific sequence length:</p>
|
|
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>trtllm-bench --model $model_name build --tp_size $tp_size --quantization FP8 --max_seq_length $seq_len
|
|
</pre></div>
|
|
</div>
|
|
<p>If you would like to build an FP16 engine without any quantization, simply remove the <code class="docutils literal notranslate"><span class="pre">--quantization</span> <span class="pre">FP8</span></code> option.</p>
|
|
<blockquote>
|
|
<div><p>[!NOTE] If you specify FP8 quantization, the KV cache will automatically be set to FP8 as well!</p>
|
|
</div></blockquote>
|
|
The `trtllm-bench build` sub-command will output the path where the engine is located upon a successful build. For example:

```shell
===========================================================
ENGINE SAVED: /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
===========================================================
```
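As a concrete sketch, the engine path shown above corresponds to a build along these lines; the dataset path is the illustrative placeholder used earlier rather than a value from this page:

```shell
# FP8 build of meta-llama/Llama-2-7b-hf on a single GPU (TP=1)
trtllm-bench --model meta-llama/Llama-2-7b-hf build --tp_size 1 --quantization FP8 \
    --dataset /tmp/llama-2-7b_128_128.txt

# The printed "ENGINE SAVED" path can then be reused as $engine_dir for the benchmark run
engine_dir=/tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
```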
<section id="running-the-benchmark">
|
|
<h2>Running the Benchmark<a class="headerlink" href="#running-the-benchmark" title="Link to this heading"></a></h2>
|
|
<p>To run the benchmark with the generated data set, simply use the <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">throughput</span></code> sub-command. The benchmarker will
|
|
run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide
|
|
the patch to the engine from the <a class="reference internal" href="#engine-building"><span class="xref myst">build</span></a> phase and a <a class="reference internal" href="#preparing-a-dataset"><span class="xref myst">generated dataset</span></a>.</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>trtllm-bench<span class="w"> </span>--model<span class="w"> </span><span class="nv">$model_name</span><span class="w"> </span>throughput<span class="w"> </span>--dataset<span class="w"> </span><span class="nv">$dataset_file</span><span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_dir</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The results will be printed to the terminal upon benchmark completion. For example,</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span>ENGINE<span class="w"> </span><span class="nv">DETAILS</span>
|
|
<span class="o">===========================================================</span>
|
|
Model:<span class="w"> </span>meta-llama/Llama-2-7b-hf
|
|
Engine<span class="w"> </span>Directory:<span class="w"> </span>/tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
|
|
TensorRT-LLM<span class="w"> </span>Version:<span class="w"> </span><span class="m">0</span>.12.0
|
|
Dtype:<span class="w"> </span>float16
|
|
KV<span class="w"> </span>Cache<span class="w"> </span>Dtype:<span class="w"> </span>FP8
|
|
Quantization:<span class="w"> </span>FP8
|
|
Max<span class="w"> </span>Input<span class="w"> </span>Length:<span class="w"> </span><span class="m">2048</span>
|
|
Max<span class="w"> </span>Sequence<span class="w"> </span>Length:<span class="w"> </span><span class="nv">4098</span>
|
|
|
|
<span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span>WORLD<span class="w"> </span>+<span class="w"> </span>RUNTIME<span class="w"> </span><span class="nv">INFORMATION</span>
|
|
<span class="o">===========================================================</span>
|
|
TP<span class="w"> </span>Size:<span class="w"> </span><span class="m">1</span>
|
|
PP<span class="w"> </span>Size:<span class="w"> </span><span class="m">1</span>
|
|
Max<span class="w"> </span>Runtime<span class="w"> </span>Batch<span class="w"> </span>Size:<span class="w"> </span><span class="m">4096</span>
|
|
Max<span class="w"> </span>Runtime<span class="w"> </span>Tokens:<span class="w"> </span><span class="m">8192</span>
|
|
Scheduling<span class="w"> </span>Policy:<span class="w"> </span>Guaranteed<span class="w"> </span>No<span class="w"> </span>Evict
|
|
KV<span class="w"> </span>Memory<span class="w"> </span>Percentage:<span class="w"> </span><span class="m">99</span>.0%
|
|
Issue<span class="w"> </span>Rate<span class="w"> </span><span class="o">(</span>req/sec<span class="o">)</span>:<span class="w"> </span><span class="m">3</span>.680275266452667e+18
|
|
<span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span><span class="nv">STATISTICS</span>
|
|
<span class="o">===========================================================</span>
|
|
Number<span class="w"> </span>of<span class="w"> </span>requests:<span class="w"> </span><span class="m">3000</span>
|
|
Average<span class="w"> </span>Input<span class="w"> </span>Length<span class="w"> </span><span class="o">(</span>tokens<span class="o">)</span>:<span class="w"> </span><span class="m">128</span>.0
|
|
Average<span class="w"> </span>Output<span class="w"> </span>Length<span class="w"> </span><span class="o">(</span>tokens<span class="o">)</span>:<span class="w"> </span><span class="m">128</span>.0
|
|
Token<span class="w"> </span>Throughput<span class="w"> </span><span class="o">(</span>tokens/sec<span class="o">)</span>:<span class="w"> </span><span class="m">23405</span>.927228471104
|
|
Request<span class="w"> </span>Throughput<span class="w"> </span><span class="o">(</span>req/sec<span class="o">)</span>:<span class="w"> </span><span class="m">182</span>.8588064724305
|
|
Total<span class="w"> </span>Latency<span class="w"> </span><span class="o">(</span>seconds<span class="o">)</span>:<span class="w"> </span><span class="m">16</span>.406100739
|
|
<span class="o">===========================================================</span>
|
|
</pre></div>
|
|
</div>
|
|
> [!WARNING]
> In some cases, the benchmarker may not print anything at all. This behavior usually
> means that the benchmark has hit an out-of-memory issue. Try lowering the fraction of
> GPU memory reserved for the KV cache with the `--kv_cache_free_gpu_mem_fraction` option.
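For example, a re-run with a reduced KV cache fraction might look like the following; the value 0.90 is an illustrative choice, not a recommendation from this page:

```shell
# Re-run the benchmark with less GPU memory reserved for the KV cache
trtllm-bench --model $model_name throughput --dataset $dataset_file --engine_dir $engine_dir \
    --kv_cache_free_gpu_mem_fraction 0.90
```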