mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-25 05:02:59 +08:00
1444 lines
87 KiB
HTML
1444 lines
87 KiB
HTML
<!DOCTYPE html>
|
||
<html class="writer-html5" lang="en" data-content_root="../">
|
||
<head>
|
||
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||
<title>Overview — tensorrt_llm documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=19f00094" />
|
||
|
||
|
||
<!--[if lt IE 9]>
|
||
<script src="../_static/js/html5shiv.min.js"></script>
|
||
<![endif]-->
|
||
|
||
<script src="../_static/jquery.js?v=5d32c60e"></script>
|
||
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
||
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
|
||
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/js/theme.js"></script>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="Best Practices for Tuning the Performance of TensorRT-LLM" href="perf-best-practices.html" />
|
||
<link rel="prev" title="Expert Parallelism in TensorRT-LLM" href="../advanced/expert-parallelism.html" />
|
||
</head>
|
||
|
||
<body class="wy-body-for-nav">
|
||
<div class="wy-grid-for-nav">
|
||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||
<div class="wy-side-scroll">
|
||
<div class="wy-side-nav-search" >
|
||
|
||
|
||
|
||
<a href="../index.html" class="icon icon-home">
|
||
tensorrt_llm
|
||
</a>
|
||
<div role="search">
|
||
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
|
||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
||
<input type="hidden" name="check_keywords" value="yes" />
|
||
<input type="hidden" name="area" value="default" />
|
||
</form>
|
||
</div>
|
||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
||
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/batch-manager.html">The Batch Manager in TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1 current"><a class="current reference internal" href="#">Overview</a><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="#known-issues">Known Issues</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#fused-matmul-gated-silu-llama">Fused Matmul + Gated-SiLU (LLaMA)</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#throughput-measurements">Throughput Measurements</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#reproducing-benchmarked-results">Reproducing Benchmarked Results</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#building-the-tensorrt-llm-container">Building the TensorRT-LLM Container</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#methodology">Methodology</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#commands">Commands</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#variables">Variables</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#engine-building">Engine Building</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#engine-configuration-files">Engine Configuration Files</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#network-configuration-files-and-settings">Network Configuration Files and Settings</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#running-on-a100">Running on A100</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#preparing-a-dataset">Preparing a Dataset</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#running-the-benchmark">Running the Benchmark</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="perf-best-practices.html">Best Practices for Tuning the Performance of TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="perf-analysis.html">Performance Analysis</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</nav>
|
||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||
<a href="../index.html">tensorrt_llm</a>
|
||
</nav>
|
||
|
||
<div class="wy-nav-content">
|
||
<div class="rst-content">
|
||
<div role="navigation" aria-label="Page navigation">
|
||
<ul class="wy-breadcrumbs">
|
||
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
||
<li class="breadcrumb-item active">Overview</li>
|
||
<li class="wy-breadcrumbs-aside">
|
||
<a href="../_sources/performance/perf-overview.md.txt" rel="nofollow"> View page source</a>
|
||
</li>
|
||
</ul>
|
||
<hr/>
|
||
</div>
|
||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
||
<div itemprop="articleBody">
|
||
|
||
<blockquote id="perf-overview">
|
||
<div><p><strong>Important:</strong>
|
||
As of TensorRT-LLM v0.10, these performance benchmarks have changed methodology to utilize in-flight batching and
|
||
no longer utilize static benchmarking. These numbers are initial measurements and are expected to improve in future
|
||
releases.</p>
|
||
</div></blockquote>
|
||
<section id="overview">
|
||
<h1>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h1>
|
||
<p>This document summarizes performance measurements of TensorRT-LLM on H200, GH200 and H100
|
||
(Hopper), L40S (Ada) and A100 (Ampere) GPUs for a few key models.</p>
|
||
<p>The data in the following tables is provided as a reference point to help users
|
||
validate observed performance. It should not be considered as the peak
|
||
performance that can be delivered by TensorRT-LLM.</p>
|
||
<section id="known-issues">
|
||
<h2>Known Issues<a class="headerlink" href="#known-issues" title="Link to this heading"></a></h2>
|
||
<p>The following issues are being addressed to improve the efficiency of TensorRT-LLM.</p>
|
||
<section id="fused-matmul-gated-silu-llama">
|
||
<h3>Fused Matmul + Gated-SiLU (LLaMA)<a class="headerlink" href="#fused-matmul-gated-silu-llama" title="Link to this heading"></a></h3>
|
||
<p>The current implementation combines two Matmul operations into one Matmul followed by
|
||
a separate SwiGLU kernel (when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp</span></code> is enabled). There is also a more
|
||
efficient implementation that runs a single fused Matmul + SwiGLU kernel for FP8 on Hopper
|
||
(when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp</span> <span class="pre">--gemm_swiglu_plugin</span> <span class="pre">fp8</span></code> is enabled). The gemm_swiglu_plugin
|
||
will support more data types and GPU architectures in a future release.</p>
|
||
</section>
|
||
</section>
|
||
<section id="throughput-measurements">
|
||
<h2>Throughput Measurements<a class="headerlink" href="#throughput-measurements" title="Link to this heading"></a></h2>
|
||
<p>The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
|
||
and shows the throughput of a client-server scenario under maximum load.</p>
|
||
<p>The performance numbers below were collected using the steps described in this document.</p>
|
||
<p><strong>All data in the table below was generated using version 0.11.0 and presents token throughput in tokens/second.</strong></p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
<th class="head"><p></p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p><strong>GPU</strong></p></td>
|
||
<td><p>H200 141GB HBM3</p></td>
|
||
<td><p>GH200 120GB</p></td>
|
||
<td><p>H100 80GB HBM3</p></td>
|
||
<td><p>H100 80GB HBM3</p></td>
|
||
<td><p>A100-SXM4-80GB</p></td>
|
||
<td><p>L40S</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p><strong>Precision</strong></p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP8</p></td>
|
||
<td><p>FP16</p></td>
|
||
<td><p>FP16</p></td>
|
||
<td><p>FP8</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p><strong>Model</strong></p></td>
|
||
<td><p><strong>Input/Output Lengths</strong></p></td>
|
||
<td><p><strong>TP</strong></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>GPTJ 6B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>25116.54</p></td>
|
||
<td><p>24998.09</p></td>
|
||
<td><p>24456.84</p></td>
|
||
<td><p>13328.96</p></td>
|
||
<td><p>6168.8</p></td>
|
||
<td><p>7737.44</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>2845.4</p></td>
|
||
<td><p>2840.46</p></td>
|
||
<td><p>2781.11</p></td>
|
||
<td><p>1410.81</p></td>
|
||
<td><p>662</p></td>
|
||
<td><p>83.46</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>8165.91</p></td>
|
||
<td><p>7936.16</p></td>
|
||
<td><p>7643.02</p></td>
|
||
<td><p>3503.41</p></td>
|
||
<td><p>2213.44</p></td>
|
||
<td><p>1927.91</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3560.37</p></td>
|
||
<td><p>3197.21</p></td>
|
||
<td><p>3081.26</p></td>
|
||
<td><p>1326.79</p></td>
|
||
<td><p>893.43</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v2 7B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>19695.41</p></td>
|
||
<td><p>19509.49</p></td>
|
||
<td><p>17684.88</p></td>
|
||
<td><p>11605.69</p></td>
|
||
<td><p>5286.1</p></td>
|
||
<td><p>6655.52</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>2471.89</p></td>
|
||
<td><p>2401.29</p></td>
|
||
<td><p>2342.71</p></td>
|
||
<td><p>1173.81</p></td>
|
||
<td><p>558.56</p></td>
|
||
<td><p>644.72</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>7867.28</p></td>
|
||
<td><p>6689.51</p></td>
|
||
<td><p>6814.72</p></td>
|
||
<td><p>3074.4</p></td>
|
||
<td><p>1813.79</p></td>
|
||
<td><p>1591.51</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3215.63</p></td>
|
||
<td><p>3015.84</p></td>
|
||
<td><p>2820.31</p></td>
|
||
<td><p>1289.87</p></td>
|
||
<td><p>716.55</p></td>
|
||
<td><p>653.19</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v3 8B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>29084.05</p></td>
|
||
<td><p>29197.48</p></td>
|
||
<td><p>27781.28</p></td>
|
||
<td><p>15225.75</p></td>
|
||
<td><p>6450.88</p></td>
|
||
<td><p>8929.6</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3699.64</p></td>
|
||
<td><p>3780.47</p></td>
|
||
<td><p>3555.57</p></td>
|
||
<td><p>1844.38</p></td>
|
||
<td><p>775.18</p></td>
|
||
<td><p>1052.3</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>23723.81</p></td>
|
||
<td><p>22055.94</p></td>
|
||
<td><p>17894.85</p></td>
|
||
<td><p>8415.67</p></td>
|
||
<td><p>4837.47</p></td>
|
||
<td><p>4497.21</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>11193.29</p></td>
|
||
<td><p>8877.13</p></td>
|
||
<td><p>8398.71</p></td>
|
||
<td><p>3996.93</p></td>
|
||
<td><p>2271.65</p></td>
|
||
<td><p>1911.63</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>Mistral 7B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>31618.59</p></td>
|
||
<td><p>31868.45</p></td>
|
||
<td><p>30400.21</p></td>
|
||
<td><p>16108.11</p></td>
|
||
<td><p>6749.91</p></td>
|
||
<td><p>10237.23</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>3791.1</p></td>
|
||
<td><p>3795.27</p></td>
|
||
<td><p>3618.11</p></td>
|
||
<td><p>1896.76</p></td>
|
||
<td><p>783.94</p></td>
|
||
<td><p>1126.08</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>25646.02</p></td>
|
||
<td><p>20491.88</p></td>
|
||
<td><p>20518.75</p></td>
|
||
<td><p>10018.54</p></td>
|
||
<td><p>5358.28</p></td>
|
||
<td><p>5441.98</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>1</p></td>
|
||
<td><p>12068.11</p></td>
|
||
<td><p>9462.96</p></td>
|
||
<td><p>9504.59</p></td>
|
||
<td><p>4383.42</p></td>
|
||
<td><p>2465.77</p></td>
|
||
<td><p>2213.69</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v2 70B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>6652.29</p></td>
|
||
<td><p>5619.41</p></td>
|
||
<td><p>6502.44</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>10921.65</p></td>
|
||
<td><p>11043</p></td>
|
||
<td><p>10448.46</p></td>
|
||
<td><p>6219.11</p></td>
|
||
<td><p>2487.78</p></td>
|
||
<td><p>1549.09</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>15878.34</p></td>
|
||
<td><p></p></td>
|
||
<td><p>14781.66</p></td>
|
||
<td><p>10093.27</p></td>
|
||
<td><p>4233.24</p></td>
|
||
<td><p>1497.68</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>766.38</p></td>
|
||
<td><p>647.73</p></td>
|
||
<td><p>747.14</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>1296.75</p></td>
|
||
<td><p>1298.94</p></td>
|
||
<td><p>1231.26</p></td>
|
||
<td><p>714.07</p></td>
|
||
<td><p>285.9</p></td>
|
||
<td><p>179.19</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>1930.16</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1808.02</p></td>
|
||
<td><p>1230.66</p></td>
|
||
<td><p>494.29</p></td>
|
||
<td><p>176.24</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>7014.86</p></td>
|
||
<td><p>4844.17</p></td>
|
||
<td><p>5267.56</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>13365.86</p></td>
|
||
<td><p>11596.55</p></td>
|
||
<td><p>9202.42</p></td>
|
||
<td><p>3787.24</p></td>
|
||
<td><p>2267.02</p></td>
|
||
<td><p>1772.45</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>18861.53</p></td>
|
||
<td><p></p></td>
|
||
<td><p>17085.82</p></td>
|
||
<td><p>7846.64</p></td>
|
||
<td><p>5096.52</p></td>
|
||
<td><p>2290.99</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>3554.71</p></td>
|
||
<td><p>2843.31</p></td>
|
||
<td><p>2457.73</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>6604.37</p></td>
|
||
<td><p>5969.11</p></td>
|
||
<td><p>4586.99</p></td>
|
||
<td><p>1994.1</p></td>
|
||
<td><p>1137.22</p></td>
|
||
<td><p>890.83</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>10034.12</p></td>
|
||
<td><p></p></td>
|
||
<td><p>7647.54</p></td>
|
||
<td><p>4347.09</p></td>
|
||
<td><p>2152.35</p></td>
|
||
<td><p>1130.36</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>LLaMA v3 70B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p>9872.81</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>15255</p></td>
|
||
<td><p></p></td>
|
||
<td><p>13853.05</p></td>
|
||
<td><p></p></td>
|
||
<td><p>4033.42</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1284.88</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>1918.47</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1738.94</p></td>
|
||
<td><p></p></td>
|
||
<td><p>476.42</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p>9996.88</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>19071.39</p></td>
|
||
<td><p></p></td>
|
||
<td><p>10887.34</p></td>
|
||
<td><p></p></td>
|
||
<td><p>3373.71</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p></p></td>
|
||
<td><p>4985.31</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>9387.81</p></td>
|
||
<td><p></p></td>
|
||
<td><p>6029.39</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1824.06</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>Mixtral 8x7B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>26317.73</p></td>
|
||
<td><p>21768.19</p></td>
|
||
<td><p>24770.44</p></td>
|
||
<td><p>11821.14</p></td>
|
||
<td><p>5522.43</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>3181.76</p></td>
|
||
<td><p>2545.52</p></td>
|
||
<td><p>2973.11</p></td>
|
||
<td><p>1391.28</p></td>
|
||
<td><p>636.77</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>30105.61</p></td>
|
||
<td><p>23643.33</p></td>
|
||
<td><p>22120.85</p></td>
|
||
<td><p>6337.02</p></td>
|
||
<td><p>3698.23</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>2</p></td>
|
||
<td><p>15002.42</p></td>
|
||
<td><p>11683.11</p></td>
|
||
<td><p>11486.66</p></td>
|
||
<td><p>3024.95</p></td>
|
||
<td><p>1710.53</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>Falcon 180B</p></td>
|
||
<td><p>128/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>5647.01</p></td>
|
||
<td><p></p></td>
|
||
<td><p>5568.91</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>9304.06</p></td>
|
||
<td><p></p></td>
|
||
<td><p>8885.39</p></td>
|
||
<td><p></p></td>
|
||
<td><p>2171.78</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/128</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>670.99</p></td>
|
||
<td><p>693.82</p></td>
|
||
<td><p>667.8</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>1103.18</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1065.16</p></td>
|
||
<td><p></p></td>
|
||
<td><p>238.61</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>128/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>8358.01</p></td>
|
||
<td><p>6655.38</p></td>
|
||
<td><p>6376.89</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>14514.24</p></td>
|
||
<td><p></p></td>
|
||
<td><p>12447.25</p></td>
|
||
<td><p></p></td>
|
||
<td><p>2657.9</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p></p></td>
|
||
<td><p>2048/2048</p></td>
|
||
<td><p>4</p></td>
|
||
<td><p>4169.39</p></td>
|
||
<td><p>3415.05</p></td>
|
||
<td><p>3412.09</p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p></p></td>
|
||
<td><p></p></td>
|
||
<td><p>8</p></td>
|
||
<td><p>7524.11</p></td>
|
||
<td><p></p></td>
|
||
<td><p>6326.46</p></td>
|
||
<td><p></p></td>
|
||
<td><p>1392.31</p></td>
|
||
<td><p></p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><em>TP stands for Tensor Parallelism</em></p>
|
||
</section>
|
||
<section id="reproducing-benchmarked-results">
|
||
<h2>Reproducing Benchmarked Results<a class="headerlink" href="#reproducing-benchmarked-results" title="Link to this heading"></a></h2>
|
||
<section id="building-the-tensorrt-llm-container">
|
||
<h3>Building the TensorRT-LLM Container<a class="headerlink" href="#building-the-tensorrt-llm-container" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>In order to benchmark TensorRT-LLM, you will need to follow the <a class="reference internal" href="../quick-start-guide.html"><span class="xref myst">Quick Start</span></a>
|
||
build process to create a baseline container for building a wheel. Additionally, the development
|
||
container needs a copy of the source code to build the wheel and the benchmarking script. To create the
|
||
right build environment, use the following commands:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/NVIDIA/TensorRT-LLM.git
|
||
<span class="nb">cd</span><span class="w"> </span>TensorRT-LLM
|
||
git<span class="w"> </span>submodule<span class="w"> </span>update<span class="w"> </span>--init<span class="w"> </span>--recursive
|
||
git<span class="w"> </span>lfs<span class="w"> </span>install
|
||
git<span class="w"> </span>lfs<span class="w"> </span>pull
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>build
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span>
|
||
</pre></div>
|
||
</div>
|
||
<blockquote>
|
||
<div><p><strong>Warning:</strong>
|
||
If you have elevated privileges on your system, then skip the <code class="docutils literal notranslate"><span class="pre">make</span> <span class="pre">-C</span> <span class="pre">docker</span> <span class="pre">run</span> <span class="pre">LOCAL_USER=1</span></code>
|
||
command above as it may make it so that you cannot access some required system libraries within the
|
||
container because the build forces your UID and GID to match those that are set for your non-elevated
|
||
user. There are cases where the container will be booted as root (e.g., on some SLURM systems with
|
||
the pyxis plugin) which will cause libraries to be missing.</p>
|
||
</div></blockquote>
|
||
<p>If you are benchmarking in a shared environment, you need to specify the GPU indices that you would
|
||
like the container to use, otherwise the Makefile defaults to loading the container with all GPUs on
|
||
the system. For example, if you only have the 4 higher indices of GPUs on your system you can
|
||
configure it using the following example:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">NV_GPU</span><span class="o">=</span><span class="m">0</span>,1,2,3
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">GPU_OPTS</span><span class="o">=</span><span class="s1">'--gpus \"device=${NV_GPU}\"'</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Additionally, if you’d like to mount external storage to access persistent storage, or previously
|
||
built engines, you can mount directories as follows (simply replace <code class="docutils literal notranslate"><span class="pre">source</span></code> and <code class="docutils literal notranslate"><span class="pre">destination</span></code> with
|
||
the appropriate paths):</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">DOCKER_RUN_ARGS</span><span class="o">=</span><span class="s2">"-v /source:/destination"</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Once the container starts, you’ll need to build the wheel and the benchmarking scripts. From the
|
||
code root (the default directory when the container is loaded), the following commands will build
|
||
the TensorRT-LLM wheel, install dependencies, and build the benchmark scripts:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>python3<span class="w"> </span>./scripts/build_wheel.py<span class="w"> </span>--benchmarks<span class="w"> </span>--trt_root<span class="w"> </span>/usr/local/tensorrt
|
||
pip<span class="w"> </span>install<span class="w"> </span>./build/tensorrt_llm*.whl
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="methodology">
|
||
<h2>Methodology<a class="headerlink" href="#methodology" title="Link to this heading"></a></h2>
|
||
<p>The following tables are references for commands that are used as part of the benchmarking process.</p>
|
||
<section id="commands">
|
||
<h3>Commands<a class="headerlink" href="#commands" title="Link to this heading"></a></h3>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head text-left"><p>Stage</p></th>
|
||
<th class="head"><p>Description</p></th>
|
||
<th class="head"><p>Command</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#engine-building"><span class="xref myst">Build</span></a></p></td>
|
||
<td><p>Build a TensorRT-LLM engine</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-build</span> <span class="pre">--model_config</span> <span class="pre">$model_cfg</span> <span class="pre">--use_fused_mlp</span> <span class="pre">--gpt_attention_plugin</span> <span class="pre">float16</span> <span class="pre">--output_dir</span> <span class="pre">$engine_dir</span> <span class="pre">--max_batch_size</span> <span class="pre">$max_batch_size</span> <span class="pre">--max_input_len</span> <span class="pre">2048</span> <span class="pre">--max_output_len</span> <span class="pre">2048</span> <span class="pre">--reduce_fusion</span> <span class="pre">disable</span> <span class="pre">--workers</span> <span class="pre">$tp_size</span> <span class="pre">--max_num_tokens</span> <span class="pre">$max_num_tokens</span> <span class="pre">--use_paged_context_fmha</span> <span class="pre">enable</span> <span class="pre">--multiple_profiles</span> <span class="pre">enable</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset"><span class="xref myst">Dataset</span></a></p></td>
|
||
<td><p>Create a synthetic dataset</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--output=$dataset_file</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=2000</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#running-the-benchmark"><span class="xref myst">Run</span></a></p></td>
|
||
<td><p>Run a benchmark with a dataset</p></td>
|
||
<td><p><code class="docutils literal notranslate"><span class="pre">mpirun</span> <span class="pre">-n</span> <span class="pre">$tp_size</span> <span class="pre">--allow-run-as-root</span> <span class="pre">--oversubscribe</span> <span class="pre">cpp/build/benchmarks/gptManagerBenchmark</span> <span class="pre">--engine_dir</span> <span class="pre">$engine_dir</span> <span class="pre">--type</span> <span class="pre">IFB</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span> <span class="pre">--eos_id</span> <span class="pre">-1</span> <span class="pre">--scheduler_policy</span> <span class="pre">guaranteed_no_evict</span> <span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.99</span> <span class="pre">--output_csv</span> <span class="pre">result.csv</span> <span class="pre">--request_rate</span> <span class="pre">-1.0</span> <span class="pre">--enable_chunked_context</span> <span class="pre">--warm_up</span> <span class="pre">0</span></code></p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="variables">
|
||
<h3>Variables<a class="headerlink" href="#variables" title="Link to this heading"></a></h3>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head text-left"><p>Name</p></th>
|
||
<th class="head"><p>Description</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$isl</span></code></p></td>
|
||
<td><p>Benchmark input sequence length.</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$osl</span></code></p></td>
|
||
<td><p>Benchmark output sequence length.</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></td>
|
||
<td><p>Number of GPUs to run the benchmark with</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$engine_dir</span></code></p></td>
|
||
<td><p>Location to store built engine file (can be deleted after running benchmarks).</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_cfg</span></code></p></td>
|
||
<td><p>Name of the model configuration JSON file to use for building.</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_name</span></code></p></td>
|
||
<td><p>HuggingFace model name (e.g., meta-llama/Llama-2-7b-hf) or the path to a local weights directory.</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code></p></td>
|
||
<td><p>Location of the dataset file generated by <code class="docutils literal notranslate"><span class="pre">prepare_dataset.py</span></code></p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$results_csv</span></code></p></td>
|
||
<td><p>Path to store end results to.</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></td>
|
||
<td><p>Absolute maximum number of concurrent requests an engine can handle during one iteration.</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></td>
|
||
<td><p>Maximum number of total tokens an engine can handle during one iteration.</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="engine-building">
|
||
<h3>Engine Building<a class="headerlink" href="#engine-building" title="Link to this heading"></a></h3>
|
||
<p>All benchmarks were run using a single engine with a configuration that is capable of handling the
|
||
maximum sequence lengths encountered during benchmarking. For each benchmark, regardless of input/output
|
||
sequence length, you can reuse the single engine to run all tests. Each engine will be built with a paged
|
||
KV cache and in-flight batching enabled. For more information see the
|
||
<a class="reference internal" href="../overview.html#in-flight-batching-and-paged-attention"><span class="std std-ref">documentation about in-flight batching</span></a>.</p>
|
||
<p>In order to build an engine you will need to run the following command by specifying a configuration file
|
||
for the model that you would like to build (see <a class="reference internal" href="#network-configuration-files-and-settings"><span class="xref myst">below</span></a>). The general build
|
||
command is as follows:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>trtllm-build<span class="w"> </span>--model_config<span class="w"> </span><span class="nv">$model_cfg</span><span class="w"> </span>--use_fused_mlp<span class="w"> </span>--gpt_attention_plugin<span class="w"> </span>float16<span class="w"> </span>--output_dir<span class="w"> </span><span class="nv">$engine_dir</span><span class="w"> </span>--max_batch_size<span class="w"> </span><span class="nv">$max_batch_size</span><span class="w"> </span>--max_input_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span>--max_output_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span>--reduce_fusion<span class="w"> </span>disable<span class="w"> </span>--workers<span class="w"> </span><span class="nv">$tp_size</span><span class="w"> </span>--max_num_tokens<span class="w"> </span><span class="nv">$max_num_tokens</span><span class="w"> </span>--use_paged_context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span>--multiple_profiles<span class="w"> </span><span class="nb">enable</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Some notes about the command:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">--workers</span></code> affects the number of threads that build the engine file and does not necessarily need to match
|
||
the TP size. Make sure to set the tensor parallelism in the <code class="docutils literal notranslate"><span class="pre">$model_cfg</span></code> JSON file. See <a class="reference internal" href="#network-configuration-files-and-settings"><span class="xref myst">below</span></a>.</p></li>
|
||
<li><p>You can run benchmarks for datasets that fit within the bounds of the <code class="docutils literal notranslate"><span class="pre">max_input_len</span></code> and <code class="docutils literal notranslate"><span class="pre">max_output_len</span></code> parameters.</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="engine-configuration-files">
|
||
<h3>Engine Configuration Files<a class="headerlink" href="#engine-configuration-files" title="Link to this heading"></a></h3>
|
||
<p>In order to configure the TensorRT-LLM build process for benchmarking, you need to provide
|
||
<code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> a configuration file that specifies the network configuration, parallelism
|
||
mapping, and quantization options.</p>
|
||
<p>Below we document how to benchmark each model on an H100-HBM3-80GB system and reproduce the throughput
|
||
numbers we document in our <a class="reference internal" href="#performance-of-tensorrt-llm"><span class="xref myst">Performance section</span></a>.</p>
|
||
<blockquote>
|
||
<div><p>[!Important]
|
||
In order to change the parallelism for a build, you need to modify the <code class="docutils literal notranslate"><span class="pre">mapping</span></code> dictionary in your configuration file. The settings
|
||
must conform to the following condition: <code class="docutils literal notranslate"><span class="pre">world_size</span> <span class="pre">==</span> <span class="pre">tp_size</span> <span class="pre">*</span> <span class="pre">pp_size</span></code>.</p>
|
||
</div></blockquote>
|
||
<blockquote>
|
||
<div><p>[!Note]
|
||
All configurations below are set to run utilizing FP8 by default. If you would like to run on an A100 system, see our notes about <a class="reference internal" href="#running-on-a100"><span class="xref myst">disabling FP8 quantization</span></a>.</p>
|
||
</div></blockquote>
|
||
</section>
|
||
<section id="network-configuration-files-and-settings">
|
||
<h3>Network Configuration Files and Settings<a class="headerlink" href="#network-configuration-files-and-settings" title="Link to this heading"></a></h3>
|
||
<p>Each network has its own configuration file. All networks are configured to run using FP8 quantization by default. Additionally, each network has a specific tuning for the
|
||
<code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code> and <code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code> parameters – at times varying for some
|
||
input and output sequence lengths within the same model.</p>
|
||
<blockquote>
|
||
<div><p>[!Note]
|
||
General settings are specified by “General” in the “ISL/OSL” column. For special
|
||
cases, specific input and output sequence lengths will be specified.</p>
|
||
</div></blockquote>
|
||
<table>
|
||
<tr>
|
||
<th scope="col"> Model </th> <th scope="col"> Configuration File (FP8) </th>
|
||
</tr>
|
||
<tr>
|
||
<td> EleutherAI/gpt-j-6b </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"GPTJForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">28</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">50400</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gptj"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu_new"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"rotary_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>1</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>128</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>1</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>128, 128</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> tiiuae/falcon-180B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FalconForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">232</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14848</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">65024</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"use_parallel_embedding"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"embedding_sharding_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"share_embedding_table"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"bias"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"parallel_attention"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"new_decoder_architecture"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>4</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>4096</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>8</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Llama-2-7b-hf </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">11008</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>1</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>4096</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Llama-2-70b-hf </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>2</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>4</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>4096</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>4</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>256</p></td>
|
||
<td><p>128, 4096</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>8</p></td>
|
||
<td><p>16384</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>8</p></td>
|
||
<td><p>16384</p></td>
|
||
<td><p>1024</p></td>
|
||
<td><p>128, 2048</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Meta-Llama-3-8B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128256</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14336</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>1</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> meta-llama/Meta-Llama-3-70B </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128256</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">500000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>4</p></td>
|
||
<td><p>1024</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>8</p></td>
|
||
<td><p>16384</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> mistralai/Mixtral-8x7B-v0.1 </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"MixtralForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">32768</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"head_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">128</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"swiglu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14336</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"moe_num_experts"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"moe_top_k"</span><span class="p">:</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">1000000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rope_theta"</span><span class="p">:</span><span class="w"> </span><span class="mf">1000000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"kv_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>2</p></td>
|
||
<td><p>3072</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>4</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td> mistralai/Mistral-7B-v0.1 </td>
|
||
<td>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"MistralForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">32768</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"logits_dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float32"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"use_parallel_embedding"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"embedding_sharding_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"share_embedding_table"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14336</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"use_prompt_tuning"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_num_tokens</span></code></p></th>
|
||
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">$max_batch_size</span></code></p></th>
|
||
<th class="head"><p>ISL/OSL</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>1</p></td>
|
||
<td><p>8192</p></td>
|
||
<td><p>4098</p></td>
|
||
<td><p>General</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
</section>
|
||
<section id="running-on-a100">
|
||
<h3>Running on A100<a class="headerlink" href="#running-on-a100" title="Link to this heading"></a></h3>
|
||
<p>To run the benchmarks on A100, you will need to undefine or remove the following
|
||
quantization fields from each config json file, because FP8 computation is a feature in H100 and newer GPUs.</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="preparing-a-dataset">
|
||
<h2>Preparing a Dataset<a class="headerlink" href="#preparing-a-dataset" title="Link to this heading"></a></h2>
|
||
<p>In order to prepare a dataset, you can use the provided <a class="reference download internal" download="" href="../_downloads/ea8faa5e98124e92f96b66dc586fb429/prepare_dataset.py"><span class="xref download myst">script</span></a>.
|
||
To generate a synthetic dataset, run the following command:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>benchmarks/cpp/prepare_dataset.py<span class="w"> </span>--output<span class="o">=</span><span class="nv">$dataset_file</span><span class="w"> </span>--tokenizer<span class="o">=</span><span class="nv">$model_name</span><span class="w"> </span>token-norm-dist<span class="w"> </span>--num-requests<span class="o">=</span><span class="nv">$num_requests</span><span class="w"> </span>--input-mean<span class="o">=</span><span class="nv">$isl</span><span class="w"> </span>--output-mean<span class="o">=</span><span class="nv">$osl</span><span class="w"> </span>--input-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>--output-stdev<span class="o">=</span><span class="m">0</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The command will generate a JSON file located at the path specified by <code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code> where all requests are of the same
|
||
input/output sequence length combinations. The script works by using the tokenizer to retrieve the vocabulary size and
|
||
randomly sample token IDs from it to create entirely random sequences. In the command above, all requests will be uniform
|
||
because the standard deviations for both input and output sequences are set to 0.</p>
|
||
<p>For each input and output sequence length combination, the table below details the <code class="docutils literal notranslate"><span class="pre">$num_requests</span></code> that were used. For
|
||
shorter input and output lengths, a larger number of requests was used to guarantee that the system hit a steady state
|
||
because requests enter and exit the system at a much faster rate. For longer input/output sequence lengths, requests
|
||
remain in the system longer and therefore require fewer requests to achieve steady state.</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p>Input Length</p></th>
|
||
<th class="head"><p>Output Length</p></th>
|
||
<th class="head"><p>$num_requests</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>128</p></td>
|
||
<td><p>128</p></td>
|
||
<td><p>30000</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>128</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>3000</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>128</p></td>
|
||
<td><p>4096</p></td>
|
||
<td><p>1500</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>2048</p></td>
|
||
<td><p>128</p></td>
|
||
<td><p>3000</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>2048</p></td>
|
||
<td><p>2048</p></td>
|
||
<td><p>1500</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="running-the-benchmark">
|
||
<h2>Running the Benchmark<a class="headerlink" href="#running-the-benchmark" title="Link to this heading"></a></h2>
|
||
<p>To run the benchmark with the generated dataset, run the following command from the root of the
|
||
TensorRT-LLM repository. See the <a class="reference internal" href="#variables"><span class="xref myst">variables</span></a> section for reference on variable values.</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="nv">$tp_size</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>cpp/build/benchmarks/gptManagerBenchmark<span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_dir</span><span class="w"> </span>--type<span class="w"> </span>IFB<span class="w"> </span>--dataset<span class="w"> </span><span class="nv">$dataset_file</span><span class="w"> </span>--eos_id<span class="w"> </span>-1<span class="w"> </span>--scheduler_policy<span class="w"> </span>guaranteed_no_evict<span class="w"> </span>--kv_cache_free_gpu_mem_fraction<span class="w"> </span><span class="m">0</span>.99<span class="w"> </span>--output_csv<span class="w"> </span>result.csv<span class="w"> </span>--request_rate<span class="w"> </span>-1.0<span class="w"> </span>--enable_chunked_context<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">0</span>
|
||
</pre></div>
|
||
</div>
|
||
<blockquote>
|
||
<div><p><strong>Warning (GH200 benchmarks):</strong>
|
||
For GH200 benchmarks, the command above must be modified to use <code class="docutils literal notranslate"><span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.95</span></code> to avoid an out of memory scenario.</p>
|
||
</div></blockquote>
|
||
<p>The command will run the <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> binary that will report the throughput and other metrics as part of its output
|
||
that can be compared with the table in the <a class="reference internal" href="#peak-throughput"><span class="xref myst">Performance section</span></a> of this page.</p>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||
<a href="../advanced/expert-parallelism.html" class="btn btn-neutral float-left" title="Expert Parallelism in TensorRT-LLM" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||
<a href="perf-best-practices.html" class="btn btn-neutral float-right" title="Best Practices for Tuning the Performance of TensorRT-LLM" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||
</div>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
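<!-- NOTE(review): a stray jinja2.runtime.BlockReference repr was rendered here as visible text; presumably the footer template references the parent block without calling it (super instead of super()). Confirm and fix in the theme template. -->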
|
||
|
||
<div class="footer">
|
||
<p>
|
||
Copyright © 2024 NVIDIA Corporation
|
||
</p>
|
||
<p>
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Privacy Policy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Manage My Privacy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Accessibility</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Product Security</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Contact</a>
|
||
</p>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |