# Performance of TensorRT-LLM

This document summarizes performance measurements of TensorRT-LLM on H200 and
H100 (Hopper), L40S (Ada) and A100 (Ampere) GPUs for a few key models.

The data in the following tables is provided as a reference point to help users
validate observed performance. It should not be considered the peak performance
that can be delivered by TensorRT-LLM.

## Methodology

The performance numbers below were collected using the methodology described in
the benchmarks [folder](https://github.com/NVIDIA/TensorRT-LLM/tree/rel/benchmarks/).

## Peak Throughput

The tables below provide reference data at large batch sizes, representing
high-throughput offline tasks.

All data was generated using version 0.8.0.
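The throughput column reports output tokens per second, normalized per GPU. As
an aid to interpreting the numbers (a minimal sketch, not the benchmark suite's
exact implementation), the snippet below shows one plausible way such a figure
is derived from a measured end-to-end run; the function name and the example
latency are illustrative assumptions.

```python
# Illustrative only: assumes "Throughput (out tok/s/GPU)" is generated tokens
# divided by end-to-end generation time, normalized by the number of GPUs used
# for tensor parallelism. Check the benchmark scripts for the exact definition.
def tokens_per_second_per_gpu(batch_size: int, output_len: int,
                              e2e_latency_s: float, tp: int) -> float:
    """Output tokens produced per second, per GPU."""
    total_output_tokens = batch_size * output_len
    return total_output_tokens / e2e_latency_s / tp

# Hypothetical run: batch 64, 2048 output tokens, TP=4, measured at 290 s
# end to end -> roughly 113 output tok/s/GPU.
print(tokens_per_second_per_gpu(batch_size=64, output_len=2048,
                                e2e_latency_s=290.0, tp=4))
```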
### H200 GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | Output Length | Throughput (out tok/s/GPU) |
| :---------- | :--------- | :----- | :----------- | :------------ | -------------------------: |
| GPT-J 6B    | 1024       | 1      | 128          | 128           | 29,168 |
| GPT-J 6B    | 120        | 1      | 128          | 2048          | 9,472  |
| GPT-J 6B    | 64         | 1      | 2048         | 128           | 2,961  |
| GPT-J 6B    | 64         | 1      | 2048         | 2048          | 4,149  |
|             |            |        |              |               |        |
| Mistral 7B  | 896        | 1      | 128          | 128           | 20,569 |
| Mistral 7B  | 120        | 1      | 128          | 2048          | 8,968  |
| Mistral 7B  | 84         | 1      | 2048         | 128           | 2,450  |
| Mistral 7B  | 56         | 1      | 2048         | 2048          | 3,868  |
|             |            |        |              |               |        |
| LLaMA 7B    | 896        | 1      | 128          | 128           | 20,548 |
| LLaMA 7B    | 120        | 1      | 128          | 2048          | 8,343  |
| LLaMA 7B    | 84         | 1      | 2048         | 128           | 2,429  |
| LLaMA 7B    | 56         | 1      | 2048         | 2048          | 3,530  |
|             |            |        |              |               |        |
| LLaMA 70B   | 512        | 1      | 128          | 128           | 3,844  |
| LLaMA 70B   | 512        | 2      | 128          | 2048          | 4,008  |
| LLaMA 70B   | 64         | 1      | 2048         | 128           | 421    |
| LLaMA 70B   | 64         | 1      | 2048         | 2048          | 1,461  |
|             |            |        |              |               |        |
| Falcon 180B | 1024       | 4      | 128          | 128           | 1,116  |
| Falcon 180B | 1024       | 4      | 128          | 2048          | 990    |
| Falcon 180B | 64         | 4      | 2048         | 128           | 118    |
| Falcon 180B | 64         | 4      | 2048         | 2048          | 269    |
### H100 GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | Output Length | Throughput (out tok/s/GPU) |
| :---------- | :--------- | :----- | :----------- | :------------ | -------------------------: |
| GPT-J 6B    | 1024       | 1      | 128          | 128           | 27,357 |
| GPT-J 6B    | 120        | 1      | 128          | 2048          | 7,831  |
| GPT-J 6B    | 64         | 1      | 2048         | 128           | 2,661  |
| GPT-J 6B    | 64         | 1      | 2048         | 2048          | 3,409  |
|             |            |        |              |               |        |
| Mistral 7B  | 896        | 1      | 128          | 128           | 20,517 |
| Mistral 7B  | 120        | 1      | 128          | 2048          | 8,619  |
| Mistral 7B  | 64         | 1      | 2048         | 128           | 2,438  |
| Mistral 7B  | 56         | 1      | 2048         | 2048          | 3,733  |
|             |            |        |              |               |        |
| LLaMA 7B    | 896        | 1      | 128          | 128           | 20,241 |
| LLaMA 7B    | 120        | 1      | 128          | 2048          | 6,922  |
| LLaMA 7B    | 64         | 1      | 2048         | 128           | 2,170  |
| LLaMA 7B    | 56         | 1      | 2048         | 2048          | 2,816  |
|             |            |        |              |               |        |
| LLaMA 70B   | 1024       | 2      | 128          | 128           | 3,269  |
| LLaMA 70B   | 512        | 4      | 128          | 2048          | 2,718  |
| LLaMA 70B   | 96         | 2      | 2048         | 128           | 347    |
| LLaMA 70B   | 64         | 2      | 2048         | 2048          | 1,020  |
|             |            |        |              |               |        |
| Falcon 180B | 512        | 4      | 128          | 128           | 1,048  |
| Falcon 180B | 1024       | 8      | 128          | 2048          | 836    |
| Falcon 180B | 64         | 4      | 2048         | 128           | 114    |
| Falcon 180B | 64         | 4      | 2048         | 2048          | 250    |
### L40S GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | Output Length | Throughput (out tok/s/GPU) |
| :---------- | :--------- | :----- | :----------- | :------------ | -------------------------: |
| GPT-J 6B    | 512        | 1      | 128          | 128           | 7,992 |
| GPT-J 6B    | 64         | 1      | 128          | 2048          | 1,874 |
| GPT-J 6B    | 32         | 1      | 2048         | 128           | 693   |
| GPT-J 6B    | 32         | 1      | 2048         | 2048          | 768   |
|             |            |        |              |               |       |
| Mistral 7B  | 896        | 1      | 128          | 128           | 9,679 |
| Mistral 7B  | 120        | 1      | 128          | 2048          | 4,401 |
| Mistral 7B  | 84         | 1      | 2048         | 128           | 979   |
| Mistral 7B  | 56         | 1      | 2048         | 2048          | 1,721 |
|             |            |        |              |               |       |
| LLaMA 7B    | 256        | 1      | 128          | 128           | 5,954 |
| LLaMA 7B    | 64         | 1      | 128          | 2048          | 1,654 |
| LLaMA 7B    | 32         | 1      | 2048         | 128           | 579   |
| LLaMA 7B    | 16         | 1      | 2048         | 2048          | 542   |
|             |            |        |              |               |       |
| LLaMA 70B   | 256        | 2      | 128          | 128           | 561   |
| LLaMA 70B   | 256        | 4      | 128          | 2048          | 471   |
| LLaMA 70B   | 16         | 2      | 2048         | 128           | 49    |
| LLaMA 70B   | 64         | 4      | 2048         | 2048          | 177   |
|             |            |        |              |               |       |
| Falcon 180B | 512        | 8      | 128          | 128           | 152   |
| Falcon 180B | 256        | 8      | 128          | 2048          | 200   |
| Falcon 180B | 32         | 8      | 2048         | 128           | 15    |
| Falcon 180B | 16         | 8      | 2048         | 2048          | 39    |
### A100 GPUs (FP16)

| Model       | Batch Size | TP (1) | Input Length | Output Length | Throughput (out tok/s/GPU) |
| :---------- | :--------- | :----- | :----------- | :------------ | -------------------------: |
| GPT-J 6B    | 512        | 1      | 128          | 128           | 6,810 |
| GPT-J 6B    | 32         | 1      | 128          | 2048          | 1,658 |
| GPT-J 6B    | 32         | 1      | 2048         | 128           | 631   |
| GPT-J 6B    | 16         | 1      | 2048         | 2048          | 692   |
|             |            |        |              |               |       |
| Mistral 7B  | 896        | 1      | 128          | 128           | 6,472 |
| Mistral 7B  | 120        | 1      | 128          | 2048          | 3,812 |
| Mistral 7B  | 84         | 1      | 2048         | 128           | 734   |
| Mistral 7B  | 56         | 1      | 2048         | 2048          | 1,607 |
|             |            |        |              |               |       |
| LLaMA 7B    | 256        | 1      | 128          | 128           | 5,353 |
| LLaMA 7B    | 32         | 1      | 128          | 2048          | 1,518 |
| LLaMA 7B    | 32         | 1      | 2048         | 128           | 547   |
| LLaMA 7B    | 16         | 1      | 2048         | 2048          | 613   |
|             |            |        |              |               |       |
| LLaMA 70B   | 256        | 4      | 128          | 128           | 565   |
| LLaMA 70B   | 128        | 4      | 128          | 2048          | 595   |
| LLaMA 70B   | 32         | 4      | 2048         | 128           | 66    |
| LLaMA 70B   | 32         | 4      | 2048         | 2048          | 185   |
|             |            |        |              |               |       |
| Falcon 180B | 256        | 8      | 128          | 128           | 193   |
| Falcon 180B | 256        | 8      | 128          | 2048          | 203   |
| Falcon 180B | 16         | 8      | 2048         | 128           | 20    |

(1) TP stands for Tensor Parallelism.
## Low Latency **

All data was generated using version 0.8.0.

** Low latency numbers will soon be updated to reflect real-time latency with
in-flight batching.

The tables below provide reference data at batch size 1 for first-token
latency, representing the end-user's perceived latency for online streaming
tasks.
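As a rough illustration of what "1st Token Latency" measures (a minimal sketch
under stated assumptions, not a TensorRT-LLM API), the timing below captures
the delay between submitting a batch-size-1 request and receiving the first
streamed output token; `generate_stream` is a hypothetical placeholder for
whatever streaming generation entry point is used.

```python
# Sketch only: `generate_stream` is a placeholder, not a real TensorRT-LLM call.
import time

def first_token_latency_ms(generate_stream, prompt_ids) -> float:
    """Time from request submission until the first output token is available."""
    start = time.perf_counter()
    stream = generate_stream(prompt_ids)  # starts prefill over the input tokens
    next(iter(stream))                    # block until the first token arrives
    return (time.perf_counter() - start) * 1e3
```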
### H200 GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | 1st Token Latency (ms) |
| :---------- | :--------- | :----- | :----------- | ---------------------: |
| GPT-J 6B    | 1          | 1      | 128          | 5.2   |
| GPT-J 6B    | 1          | 1      | 2048         | 23.6  |
|             |            |        |              |       |
| Mistral 7B  | 1          | 1      | 128          | 6.0   |
| Mistral 7B  | 1          | 1      | 2048         | 31.8  |
|             |            |        |              |       |
| LLaMA 7B    | 1          | 1      | 128          | 5.8   |
| LLaMA 7B    | 1          | 1      | 2048         | 30.1  |
|             |            |        |              |       |
| LLaMA 70B   | 1          | 8      | 128          | 16.0  |
| LLaMA 70B   | 1          | 8      | 2048         | 78.8  |
|             |            |        |              |       |
| Falcon 180B | 1          | 8      | 128          | 37.2  |
| Falcon 180B | 1          | 8      | 2048         | 120.8 |
### H100 GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | 1st Token Latency (ms) |
| :---------- | :--------- | :----- | :----------- | ---------------------: |
| GPT-J 6B    | 1          | 1      | 128          | 5.7   |
| GPT-J 6B    | 1          | 1      | 2048         | 23.8  |
|             |            |        |              |       |
| Mistral 7B  | 1          | 1      | 128          | 6.6   |
| Mistral 7B  | 1          | 1      | 2048         | 32.6  |
|             |            |        |              |       |
| LLaMA 7B    | 1          | 1      | 128          | 6.4   |
| LLaMA 7B    | 1          | 1      | 2048         | 31.0  |
|             |            |        |              |       |
| LLaMA 70B   | 1          | 8      | 128          | 17.0  |
| LLaMA 70B   | 1          | 8      | 2048         | 84.4  |
|             |            |        |              |       |
| Falcon 180B | 1          | 8      | 128          | 39.7  |
| Falcon 180B | 1          | 8      | 2048         | 128.0 |
### L40S GPUs (FP8)

| Model       | Batch Size | TP (1) | Input Length | 1st Token Latency (ms) |
| :---------- | :--------- | :----- | :----------- | ---------------------: |
| GPT-J 6B    | 1          | 1      | 128          | 12.6  |
| GPT-J 6B    | 1          | 1      | 2048         | 61.2  |
|             |            |        |              |       |
| Mistral 7B  | 1          | 1      | 128          | 15.5  |
| Mistral 7B  | 1          | 1      | 2048         | 84.3  |
|             |            |        |              |       |
| LLaMA 7B    | 1          | 1      | 128          | 14.3  |
| LLaMA 7B    | 1          | 1      | 2048         | 79.0  |
|             |            |        |              |       |
| LLaMA 70B   | 1          | 8      | 128          | 70.9  |
| LLaMA 70B   | 1          | 8      | 2048         | 708.7 |
|             |            |        |              |       |
| Falcon 180B | 1          | 8      | 128          | 93.4  |
| Falcon 180B | 1          | 8      | 2048         | 769.8 |
### A100 GPUs (FP16)

| Model       | Batch Size | TP (1) | Input Length | 1st Token Latency (ms) |
| :---------- | :--------- | :----- | :----------- | ---------------------: |
| GPT-J 6B    | 1          | 1      | 128          | 14.1  |
| GPT-J 6B    | 1          | 1      | 2048         | 102.8 |
|             |            |        |              |       |
| Mistral 7B  | 1          | 1      | 128          | 16.4  |
| Mistral 7B  | 1          | 1      | 2048         | 128.7 |
|             |            |        |              |       |
| LLaMA 7B    | 1          | 1      | 128          | 16.1  |
| LLaMA 7B    | 1          | 1      | 2048         | 120.5 |
|             |            |        |              |       |
| LLaMA 70B   | 1          | 8      | 128          | 35.6  |
| LLaMA 70B   | 1          | 8      | 2048         | 235.1 |
|             |            |        |              |       |
| Falcon 180B | 1          | 8      | 128          | 76.5  |
| Falcon 180B | 1          | 8      | 2048         | 463.0 |

(1) TP stands for Tensor Parallelism.
<section id="known-issues">
|
||
<h2>Known Issues<a class="headerlink" href="#known-issues" title="Link to this heading"></a></h2>
|
||
<p>The following issues are being addressed to improve the efficiency of TensorRT-LLM.</p>
|
||
<section id="fused-matmul-gated-silu-llama">
|
||
<h3>Fused Matmul + Gated-SiLU (LLaMA)<a class="headerlink" href="#fused-matmul-gated-silu-llama" title="Link to this heading"></a></h3>
|
||
<p>The current implementation combines two Matmul operations into one Matmul followed by
|
||
a separate SwiGLU kernel (when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp</span></code> is enabled). The future release will
|
||
include a more efficient implementation that runs single Matmul + SwiGLU fused kernel.</p>
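<p>To experiment with the current fused path, add the flag to the LLaMA build commands shown later in
this document. The following is only a sketch: depending on your TensorRT-LLM version the flag may be
exposed by <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> or only by the per-model example build script, so verify with
<code class="docutils literal notranslate"><span class="pre">--help</span></code> first.</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Hypothetical invocation: the Llama2-7b build from below with --use_fused_mlp appended.
# Assumption: this build of TensorRT-LLM exposes the flag on trtllm-build; check `trtllm-build --help`.
trtllm-build --model_config /tmp/engines/llama/7b/ckpt_config.json \
             --output_dir /tmp/engines/llama/7b \
             --gpt_attention_plugin float16 \
             --max_batch_size 64 --max_input_len 2048 --max_output_len 2048 \
             --use_fused_mlp
</pre></div>
</div>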
</section>
</section>
<section id="reproducing-benchmarked-results">
|
||
<h2>Reproducing Benchmarked Results<a class="headerlink" href="#reproducing-benchmarked-results" title="Link to this heading"></a></h2>
|
||
<section id="building-the-tensorrt-llm-container">
|
||
<h3>Building the TensorRT-LLM Container<a class="headerlink" href="#building-the-tensorrt-llm-container" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>In order to benchmark TensorRT-LLM, you will need to follow the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/../../README.md">Quick Start</a>
|
||
build process to create a baseline container for building a wheel. Additionally, the development
|
||
container needs a copy of the source code to build the wheel and the benchmarking script. Create the
|
||
right build environment, use the following :</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/NVIDIA/TensorRT-LLM.git
|
||
<span class="nb">cd</span><span class="w"> </span>TensorRT-LLM
|
||
git<span class="w"> </span>submodule<span class="w"> </span>update<span class="w"> </span>--init<span class="w"> </span>--recursive
|
||
git<span class="w"> </span>lfs<span class="w"> </span>install
|
||
git<span class="w"> </span>lfs<span class="w"> </span>pull
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>build
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span>
|
||
</pre></div>
|
||
</div>
|
||
<blockquote>
<div><p>[!WARNING]
If you have elevated privileges on your system, skip the <code class="docutils literal notranslate"><span class="pre">make</span> <span class="pre">-C</span> <span class="pre">docker</span> <span class="pre">run</span> <span class="pre">LOCAL_USER=1</span></code>
command above. It forces the container user's UID and GID to match those of your non-elevated user,
which can make some required system libraries inaccessible inside the container. There are also cases
where the container is booted as root (for example, on some SLURM systems with the pyxis plugin),
which will likewise cause libraries to be missing.</p>
</div></blockquote>
<p>If you are benchmarking in a shared environment, you need to specify the GPU indices that you would
like the container to use; otherwise, the Makefile defaults to loading the container with all GPUs on
the system. For example, to expose only four specific GPUs to the container, set their indices as
follows:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">NV_GPU</span><span class="o">=</span><span class="m">0</span>,1,2,3
|
||
make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">GPU_OPTS</span><span class="o">=</span><span class="s1">'--gpus \"device=${NV_GPU}\"'</span>
|
||
</pre></div>
|
||
</div>
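<p>Inside the container you can confirm that only the requested devices are visible. This is a quick
sanity check; it assumes the NVIDIA container toolkit exposes <code class="docutils literal notranslate"><span class="pre">nvidia-smi</span></code> inside the
container, which it does by default:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Lists the GPUs the container can actually see; the count should match NV_GPU above.
nvidia-smi -L
</pre></div>
</div>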
<p>Additionally, if you’d like to mount external storage to access persistent data or previously
built engines, you can mount directories as follows (simply replace <code class="docutils literal notranslate"><span class="pre">source</span></code> and <code class="docutils literal notranslate"><span class="pre">destination</span></code> with
the appropriate paths):</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>make<span class="w"> </span>-C<span class="w"> </span>docker<span class="w"> </span>run<span class="w"> </span><span class="nv">LOCAL_USER</span><span class="o">=</span><span class="m">1</span><span class="w"> </span><span class="nv">DOCKER_RUN_ARGS</span><span class="o">=</span><span class="s2">"-v /source:/destination"</span>
</pre></div>
</div>
<p>Once the container starts, you’ll need to build the wheel and the benchmarking scripts. From the
code root (the default directory when the container is loaded), the following commands will build
the TensorRT-LLM wheel, install it, and build the benchmark scripts:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>python3<span class="w"> </span>./scripts/build_wheel.py<span class="w"> </span>--benchmarks<span class="w"> </span>--trt_root<span class="w"> </span>/usr/local/tensorrt
pip<span class="w"> </span>install<span class="w"> </span>./build/tensorrt_llm*.whl
</pre></div>
</div>
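<p>Before running any benchmarks, a quick import check confirms that the wheel installed correctly.
This is only a minimal sketch; it verifies that the Python package loads, nothing more:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Prints the installed TensorRT-LLM version if the wheel import works.
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
</pre></div>
</div>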
</section>
</section>
<section id="id5">
|
||
<h2>Methodology<a class="headerlink" href="#id5" title="Link to this heading"></a></h2>
|
||
<section id="engine-building-setups">
|
||
<h3>Engine Building Setups<a class="headerlink" href="#engine-building-setups" title="Link to this heading"></a></h3>
|
||
<p>Each engine needs to be built before they can be benchmarked, and requires the source code for each
|
||
of their respective build scripts. For smaller models, it is fine to build the engine on the fly in
|
||
container; however, for larger engines it is recommended to pre-build and mount a directory with the
|
||
engine because engine files are quite large and take time to repeatedly build. Additionally, built
|
||
engines can be used for input lengths, output lengths, and batch sizes <em>up to</em> their build options
|
||
meaning you can use an engine to benchmark multiple input configurations.</p>
|
||
<p>In order to benchmark the various networks, our engine building scheme is as follows:</p>
|
||
<ul class="simple">
|
||
<li><p>For the GPT-J, Llama2-7b, and Llama2-70b benchmarks were ran using a single-setting engine build
|
||
for each network configured for our maximum expected throughput.</p></li>
|
||
<li><p>For Falcon-180B, where memory limits and model size have a higher impact for running the model,
|
||
our benchmarks transition to a per-configuration engine build.</p></li>
|
||
</ul>
|
||
<p>Below we document how to benchmark each model on an H100-HBM3-80GB system and reproduce the throughput
|
||
numbers we document on our [Performance section](#performance of-tensorrt-llm).</p>
|
||
</section>
|
||
<section id="running-on-a100">
|
||
<h3>Running on A100<a class="headerlink" href="#running-on-a100" title="Link to this heading"></a></h3>
|
||
<p>To run the benchmarks below on A100, you will need to remove the below fp8 quantization field from each
|
||
config json file, because FP8 computation is a feature in H100 and newer GPUs.</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
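<p>One way to strip the field in place is with <code class="docutils literal notranslate"><span class="pre">jq</span></code>. This is a minimal sketch: it assumes
<code class="docutils literal notranslate"><span class="pre">jq</span></code> is installed in your container and uses the GPT-J config path from below as an example.</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Deletes the "quantization" block so the engine is built without FP8 on A100.
config=/tmp/engines/gptj/ckpt_config.json
jq 'del(.quantization)' "$config" > "$config.tmp"
mv "$config.tmp" "$config"
</pre></div>
</div>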
</section>
<section id="reproducing-first-token-latency">
|
||
<h3>Reproducing First Token Latency<a class="headerlink" href="#reproducing-first-token-latency" title="Link to this heading"></a></h3>
|
||
<p>In order to test the latency to the first token, you can build the engines as specified below (or
|
||
with the tweaks specified above on A100) – once built as described in the
|
||
<a class="reference internal" href="#engine-building-setups"><span class="xref myst">build steps</span></a> above, you can then benchmark with a single output token in
|
||
order to find the time to first token latency. We provide the appropriate command lines below for
|
||
each of the benchmarked models, but you can use this same method to benchmark other models available
|
||
in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM">TensorRT-LLM</a>.</p>
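<p>The general pattern is to reuse the throughput command with the output length pinned to one token.
The following is only a sketch: <code class="docutils literal notranslate"><span class="pre">$engine_path</span></code> and <code class="docutils literal notranslate"><span class="pre">$isl</span></code> are placeholders for an
engine directory and an input length you have already built for, and <code class="docutils literal notranslate"><span class="pre">--model</span></code> should match that engine.</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># With an output length of 1, the reported latency is effectively the time to the first token.
./cpp/build/benchmarks/gptSessionBenchmark --model gptj --engine_dir "$engine_path" \
    --warm_up 1 --batch_size 64 --duration 0 --num_runs 5 --input_output_len "${isl},1"
</pre></div>
</div>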
</section>
</section>
<section id="benchmarking-per-model">
|
||
<h2>Benchmarking per Model<a class="headerlink" href="#benchmarking-per-model" title="Link to this heading"></a></h2>
|
||
<blockquote>
|
||
<div><p>[!WARNING]
|
||
In some cases, using Group Query Attention (GQA) can improve performance of some networks. These
|
||
kernels are currently experimental and not enabled by default. In order to enable them, simply run
|
||
<code class="docutils literal notranslate"><span class="pre">export</span> <span class="pre">TRTLLM_ENABLE_XQA=1</span></code> in your shell. The kernels are an inference runtime optimization, so
|
||
previously built engines should still function. For the benchmarks below, we have enabled GQA where
|
||
our tests displayed performance benefits. If your network is not listed below, be sure to try both
|
||
GQA-enabled and GQA-disabled configurations to find the configuration that works best.
|
||
For more details see our documentation about <a class="reference internal" href="gpt_attention.html#generation-phase"><span class="std std-ref">GPT Attention</span></a>.</p>
|
||
</div></blockquote>
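<p>A minimal way to compare the two configurations on the same engine (assuming the Llama2-7b engine
and batch size used here have already been built as described in the following sections):</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Run once with the experimental kernels disabled (variable unset) and once enabled, then compare.
unset TRTLLM_ENABLE_XQA
./cpp/build/benchmarks/gptSessionBenchmark --model llama --engine_dir /tmp/engines/llama/7b \
    --warm_up 1 --batch_size 64 --duration 0 --num_runs 5 --input_output_len 128,128

export TRTLLM_ENABLE_XQA=1
./cpp/build/benchmarks/gptSessionBenchmark --model llama --engine_dir /tmp/engines/llama/7b \
    --warm_up 1 --batch_size 64 --duration 0 --num_runs 5 --input_output_len 128,128
</pre></div>
</div>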
<section id="gpt-j-6b">
|
||
<h3>GPT-J 6B<a class="headerlink" href="#gpt-j-6b" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>Prepare a config json file <code class="docutils literal notranslate"><span class="pre">/tmp/engines/gptj/ckpt_config.json</span></code>:</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"GPTJForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">28</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">50400</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gptj"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"rotary_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Build an engine:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>trtllm-build<span class="w"> </span>--model_config<span class="w"> </span>/tmp/engines/gptj/ckpt_config.json<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--output_dir<span class="w"> </span>/tmp/engines/gptj<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--gpt_attention_plugin<span class="w"> </span>float16<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_batch_size<span class="w"> </span><span class="m">64</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_input_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_output_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--strongly_typed
|
||
</pre></div>
|
||
</div>
|
||
<section id="throughput-benchmark">
|
||
<h4>Throughput Benchmark<a class="headerlink" href="#throughput-benchmark" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,128"</span><span class="w"> </span><span class="s2">"64:128,2048"</span><span class="w"> </span><span class="s2">"64:2048,128"</span><span class="w"> </span><span class="s2">"64:2048,2048"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>gptj<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/gptj/<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
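<p>If you want to convert the latency this benchmark reports into generated tokens per second, one
simple back-of-the-envelope conversion is shown below. This is only a sketch; the values are
hypothetical and should be replaced with the batch size, output length, and millisecond latency from
the benchmark line you are interested in.</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Hypothetical values copied from one benchmark run: batch size 64, OSL 128, latency 1234.5 ms.
batch_size=64
osl=128
latency_ms=1234.5
python3 -c "print(f'{${batch_size} * ${osl} / (${latency_ms} / 1000.0):.1f} generated tokens/s')"
</pre></div>
</div>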
|
||
</section>
|
||
<section id="first-token-latency-benchmark">
|
||
<h4>First Token Latency Benchmark<a class="headerlink" href="#first-token-latency-benchmark" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,1"</span><span class="w"> </span><span class="s2">"64:2048,1"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>gptj<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/gptj/<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="llama2-7b">
|
||
<h3>Llama2-7b<a class="headerlink" href="#llama2-7b" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>Prepare a config json file <code class="docutils literal notranslate"><span class="pre">/tmp/engines/llama/7b/ckpt_config.json</span></code>:</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">11008</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">32</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Build an engine:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>examples/llama/requirements.txt
|
||
trtllm-build<span class="w"> </span>--model_config<span class="w"> </span>/tmp/engines/llama/7b/ckpt_config.json<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--output_dir<span class="w"> </span>/tmp/engines/llama/7b<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--remove_input_padding<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--gpt_attention_plugin<span class="w"> </span>float16<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_batch_size<span class="w"> </span><span class="m">64</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_input_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_output_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--strongly_typed
|
||
</pre></div>
|
||
</div>
|
||
<section id="id6">
|
||
<h4>Throughput Benchmark<a class="headerlink" href="#id6" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,128"</span><span class="w"> </span><span class="s2">"64:128,2048"</span><span class="w"> </span><span class="s2">"64:2048,128"</span><span class="w"> </span><span class="s2">"32:2048,2048"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>llama<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/llama/7b<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="id7">
|
||
<h4>First Token Latency Benchmark<a class="headerlink" href="#id7" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,1"</span><span class="w"> </span><span class="s2">"32:2048,1"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>llama<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/llama/7b<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="llama2-70b">
|
||
<h3>Llama2-70b<a class="headerlink" href="#llama2-70b" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>Prepare a config json file <code class="docutils literal notranslate"><span class="pre">/tmp/engines/llama/70b/ckpt_config.json</span></code>:</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"LlamaForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"float16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">64</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8192</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"intermediate_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">28672</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">32000</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">4096</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"silu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_base"</span><span class="p">:</span><span class="w"> </span><span class="mf">10000.0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"rotary_scaling"</span><span class="p">:</span><span class="w"> </span><span class="kc">null</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
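<p>The <code class="docutils literal notranslate"><span class="pre">mapping</span></code> block above shards the model across four GPUs with tensor parallelism, and the
<code class="docutils literal notranslate"><span class="pre">mpirun</span></code> rank count used for benchmarking below has to match <code class="docutils literal notranslate"><span class="pre">world_size</span></code>. A quick way to
read it back before launching (a minimal sketch, assuming <code class="docutils literal notranslate"><span class="pre">jq</span></code> is available):</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Reads world_size from the checkpoint config so the mpirun rank count can be derived from it.
world_size=$(jq -r '.mapping.world_size' /tmp/engines/llama/70b/ckpt_config.json)
echo "Launch the benchmark with: mpirun -n ${world_size} ..."
</pre></div>
</div>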
|
||
<p>Build an engine:</p>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>examples/llama/requirements.txt
|
||
trtllm-build<span class="w"> </span>--model_config<span class="w"> </span>/tmp/engines/llama/70b/ckpt_config.json<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--output_dir<span class="w"> </span>/tmp/engines/llama/70b<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--workers<span class="w"> </span><span class="m">4</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--remove_input_padding<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--gpt_attention_plugin<span class="w"> </span>float16<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_batch_size<span class="w"> </span><span class="m">64</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_input_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_output_len<span class="w"> </span><span class="m">2048</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--strongly_typed
|
||
</pre></div>
|
||
</div>
|
||
<section id="id8">
|
||
<h4>Throughput Benchmark<a class="headerlink" href="#id8" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">TRTLLM_ENABLE_XQA</span><span class="o">=</span><span class="m">1</span>
|
||
<span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,128"</span><span class="w"> </span><span class="s2">"64:128,2048"</span><span class="w"> </span><span class="s2">"64:2048,128"</span><span class="w"> </span><span class="s2">"64:2048,2048"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="m">4</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>llama<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/llama/70b<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="id9">
|
||
<h4>First Token Latency Benchmark<a class="headerlink" href="#id9" title="Link to this heading"></a></h4>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">TRTLLM_ENABLE_XQA</span><span class="o">=</span><span class="m">1</span>
|
||
<span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"64:128,1"</span><span class="w"> </span><span class="s2">"64:128,1"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="nv">$in_out_dims</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="m">4</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>llama<span class="w"> </span>--engine_dir<span class="w"> </span>/tmp/engines/llama/70b<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="nv">$in_out_dims</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="falcon-180b">
|
||
<h3>Falcon-180B<a class="headerlink" href="#falcon-180b" title="Link to this heading"></a></h3>
|
||
<hr class="docutils" />
|
||
<p>Benchmarking Falcon-180B requires a custom engine for each combination of batch size and
input/output sequence length, due to the model's large memory footprint and the large input size of
2048. You can build and benchmark each engine one at a time with the following loop.</p>
|
||
<p>Prepare a config json file <code class="docutils literal notranslate"><span class="pre">/tmp/engines/falcon/180b/ckpt_config.json</span></code>:</p>
|
||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"architecture"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FalconForCausalLM"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"dtype"</span><span class="p">:</span><span class="w"> </span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_hidden_layers"</span><span class="p">:</span><span class="w"> </span><span class="mi">80</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_attention_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">232</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"num_key_value_heads"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">14848</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"norm_epsilon"</span><span class="p">:</span><span class="w"> </span><span class="mf">1e-05</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"vocab_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">65024</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"position_embedding_type"</span><span class="p">:</span><span class="w"> </span><span class="s2">"rope_gpt_neox"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"max_position_embeddings"</span><span class="p">:</span><span class="w"> </span><span class="mi">2048</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"hidden_act"</span><span class="p">:</span><span class="w"> </span><span class="s2">"gelu"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"use_parallel_embedding"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"embedding_sharding_dim"</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"share_embedding_table"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"quantization"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"kv_cache_quant_algo"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FP8"</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"mapping"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="nt">"world_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"tp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"pp_size"</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
|
||
<span class="w"> </span><span class="p">},</span>
|
||
<span class="w"> </span><span class="nt">"bias"</span><span class="p">:</span><span class="w"> </span><span class="kc">false</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"parallel_attention"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="nt">"new_decoder_architecture"</span><span class="p">:</span><span class="w"> </span><span class="kc">true</span>
|
||
<span class="p">}</span>
|
||
</pre></div>
|
||
</div>
|
||
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">TRTLLM_ENABLE_XQA</span><span class="o">=</span><span class="m">1</span>
|
||
<span class="c1"># Benchmark specific batch size:isl:osl combinations.</span>
|
||
<span class="nv">in_out_sizes</span><span class="o">=(</span><span class="s2">"96:128,128"</span><span class="w"> </span><span class="s2">"96:128,2048"</span><span class="w"> </span><span class="s2">"64:2048,128"</span><span class="o">)</span>
|
||
<span class="k">for</span><span class="w"> </span>in_out<span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="si">${</span><span class="nv">in_out_sizes</span><span class="p">[@]</span><span class="si">}</span>
|
||
<span class="k">do</span>
|
||
<span class="w"> </span><span class="nv">batch_size</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">in_out_dims</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">':'</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">isl</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out_dims</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">','</span><span class="w"> </span><span class="s1">'{ print $1 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">osl</span><span class="o">=</span><span class="k">$(</span><span class="nb">echo</span><span class="w"> </span><span class="nv">$in_out_dims</span><span class="w"> </span><span class="p">|</span><span class="w"> </span>awk<span class="w"> </span>-F<span class="s1">','</span><span class="w"> </span><span class="s1">'{ print $2 }'</span><span class="k">)</span>
|
||
<span class="w"> </span><span class="nv">engine_path</span><span class="o">=</span><span class="s2">"/tmp/engines/falcon/180b/</span><span class="si">${</span><span class="nv">batch_size</span><span class="si">}</span><span class="s2">_</span><span class="si">${</span><span class="nv">isl</span><span class="si">}</span><span class="s2">_</span><span class="si">${</span><span class="nv">osl</span><span class="si">}</span><span class="s2">"</span>
|
||
<span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"BS: </span><span class="nv">$batch_size</span><span class="s2">, ISL/OSL: </span><span class="si">${</span><span class="nv">isl</span><span class="si">}</span><span class="s2">,</span><span class="si">${</span><span class="nv">osl</span><span class="si">}</span><span class="s2">"</span>
|
||
|
||
<span class="w"> </span><span class="c1"># Build the specific engine for the BS,ISL,OSL combination</span>
|
||
<span class="w"> </span>trtllm-build<span class="w"> </span>--model_config<span class="w"> </span>/tmp/engines/falcon/180b/ckpt_config.json<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--output_dir<span class="w"> </span><span class="nv">$engine_path</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--workers<span class="w"> </span><span class="m">8</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--remove_input_padding<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--context_fmha<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--gpt_attention_plugin<span class="w"> </span>bfloat16<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--gemm_plugin<span class="w"> </span>bfloat16<span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--paged_kv_cache<span class="w"> </span><span class="nb">enable</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_input_len<span class="w"> </span><span class="nv">$isl</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--max_output_len<span class="w"> </span><span class="nv">$osl</span><span class="w"> </span><span class="se">\</span>
|
||
<span class="w"> </span>--strongly_typed
|
||
|
||
<span class="w"> </span><span class="c1"># Throughput benchmark</span>
|
||
<span class="w"> </span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="m">8</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>falcon<span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_path</span><span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="s2">"</span><span class="si">${</span><span class="nv">isl</span><span class="si">}</span><span class="s2">,</span><span class="si">${</span><span class="nv">osl</span><span class="si">}</span><span class="s2">"</span>
|
||
<span class="w"> </span><span class="c1"># Time to first token benchmark</span>
|
||
<span class="w"> </span>mpirun<span class="w"> </span>-n<span class="w"> </span><span class="m">8</span><span class="w"> </span>--allow-run-as-root<span class="w"> </span>--oversubscribe<span class="w"> </span>./cpp/build/benchmarks/gptSessionBenchmark<span class="w"> </span>--model<span class="w"> </span>falcon<span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_path</span><span class="w"> </span>--warm_up<span class="w"> </span><span class="m">1</span><span class="w"> </span>--batch_size<span class="w"> </span><span class="nv">$batch_size</span><span class="w"> </span>--duration<span class="w"> </span><span class="m">0</span><span class="w"> </span>--num_runs<span class="w"> </span><span class="m">5</span><span class="w"> </span>--input_output_len<span class="w"> </span><span class="s2">"</span><span class="si">${</span><span class="nv">isl</span><span class="si">}</span><span class="s2">,1"</span>
|
||
|
||
<span class="w"> </span><span class="c1"># The Falcon-180b engine is quite large, remove after the benchmark to free up space</span>
|
||
<span class="w"> </span><span class="c1"># Remove this line if you'd like to save the engines.</span>
|
||
<span class="w"> </span>rm<span class="w"> </span>-r<span class="w"> </span><span class="nv">$engine_path</span>
|
||
<span class="k">done</span>
|
||
</pre></div>
|
||
</div>
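<p>Each Falcon-180B engine is large, and the loop above writes a fresh engine for every configuration
before deleting it. Checking free space in the output location before starting the loop can avoid a
build failing partway through (a simple sanity check):</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre># Shows available space on the filesystem backing the engine output directory.
df -h /tmp
</pre></div>
</div>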
|
||
</section>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||
<a href="build_from_source.html" class="btn btn-neutral float-left" title="Build from Source" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||
<a href="2023-05-19-how-to-debug.html" class="btn btn-neutral float-right" title="How to debug" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||
</div>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
<p>© Copyright 2023, NVidia.</p>
|
||
</div>
|
||
|
||
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
|
||
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
|
||
provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |