mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
2818 lines
89 KiB
HTML
2818 lines
89 KiB
HTML
|
|
|
|
<!DOCTYPE html>
|
|
<html class="writer-html5" lang="en" data-content_root="../">
|
|
<head>
|
|
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
<title>Overview — tensorrt_llm documentation</title>
|
|
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
|
|
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
|
|
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
|
|
|
|
|
|
<script src="../_static/jquery.js?v=5d32c60e"></script>
|
|
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
|
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
|
|
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
|
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
|
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
|
|
<script src="../_static/copybutton.js?v=65e89d2a"></script>
|
|
<script src="../_static/js/theme.js"></script>
|
|
<link rel="index" title="Index" href="../genindex.html" />
|
|
<link rel="search" title="Search" href="../search.html" />
|
|
<link rel="next" title="TensorRT-LLM Benchmarking" href="perf-benchmarking.html" />
|
|
<link rel="prev" title="Disaggregated-Service (experimental)" href="../advanced/disaggregated-service.html" />
|
|
</head>
|
|
|
|
<body class="wy-body-for-nav">
|
|
<div class="wy-grid-for-nav">
|
|
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
|
<div class="wy-side-scroll">
|
|
<div class="wy-side-nav-search" >
|
|
|
|
|
|
|
|
<a href="../index.html" class="icon icon-home">
|
|
tensorrt_llm
|
|
</a>
|
|
<div role="search">
|
|
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
|
|
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
|
<input type="hidden" name="check_keywords" value="yes" />
|
|
<input type="hidden" name="area" value="default" />
|
|
</form>
|
|
</div>
|
|
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
|
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/index.html">LLM Examples Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/customization.html">Common Customizations</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/llm_api_examples.html">Examples</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
|
<ul class="current">
|
|
<li class="toctree-l1 current"><a class="current reference internal" href="#">Overview</a><ul>
|
|
<li class="toctree-l2"><a class="reference internal" href="#known-issues">Known Issues</a><ul>
|
|
<li class="toctree-l3"><a class="reference internal" href="#known-allreduce-performance-issue-on-amd-based-cpu-platforms">Known AllReduce performance issue on AMD-based CPU platforms</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#fused-matmul-gated-silu-llama">Fused Matmul + Gated-SiLU (LLaMA)</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#trtllm-bench-has-known-issues-on-gh200">Trtllm-bench has known issues on GH200</a></li>
|
|
</ul>
|
|
</li>
|
|
<li class="toctree-l2"><a class="reference internal" href="#throughput-measurements">Throughput Measurements</a></li>
|
|
<li class="toctree-l2"><a class="reference internal" href="#reproducing-benchmarked-results">Reproducing Benchmarked Results</a><ul>
|
|
<li class="toctree-l3"><a class="reference internal" href="#commands">Commands</a><ul>
|
|
<li class="toctree-l4"><a class="reference internal" href="#for-non-gh200-systems">For non GH200 systems</a></li>
|
|
<li class="toctree-l4"><a class="reference internal" href="#for-gh200-systems-only">For GH200 systems only</a></li>
|
|
</ul>
|
|
</li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#variables">Variables</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#preparing-a-dataset">Preparing a Dataset</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#engine-building">Engine Building</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#running-the-benchmark">Running the Benchmark</a></li>
|
|
<li class="toctree-l3"><a class="reference internal" href="#id1">For non GH200 systems</a></li>
|
|
</ul>
|
|
</li>
|
|
<li class="toctree-l2"><a class="reference internal" href="#online-serving-measurements">Online Serving Measurements</a><ul>
|
|
<li class="toctree-l3"><a class="reference internal" href="#id2">For GH200 systems only</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li class="toctree-l1"><a class="reference internal" href="perf-benchmarking.html">Benchmarking</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="perf-best-practices.html">Best Practices</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="perf-analysis.html">Performance Analysis</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
|
</ul>
|
|
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
|
</ul>
|
|
|
|
</div>
|
|
</div>
|
|
</nav>
|
|
|
|
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
|
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
|
<a href="../index.html">tensorrt_llm</a>
|
|
</nav>
|
|
|
|
<div class="wy-nav-content">
|
|
<div class="rst-content">
|
|
<div role="navigation" aria-label="Page navigation">
|
|
<ul class="wy-breadcrumbs">
|
|
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
|
<li class="breadcrumb-item active">Overview</li>
|
|
<li class="wy-breadcrumbs-aside">
|
|
<a href="../_sources/performance/perf-overview.md.txt" rel="nofollow"> View page source</a>
|
|
</li>
|
|
</ul>
|
|
<hr/>
|
|
</div>
|
|
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
|
<div itemprop="articleBody">
|
|
|
|
<blockquote id="perf-overview">
|
|
<div><p><strong>Important:</strong>
|
|
As of TensorRT-LLM v0.10, these performance benchmarks have changed methodology to utilize in-flight batching and
|
|
no longer utilize static benchmarking. These numbers are initial measurements and are expected to improve in future
|
|
releases.</p>
|
|
</div></blockquote>
|
|
<section id="overview">
|
|
<h1>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h1>
|
|
<p>This document summarizes performance measurements of TensorRT-LLM on H100
|
|
(Hopper), GH200 (Grace + Hopper), L40S (Ada) and A100 (Ampere) GPUs for a few key models.</p>
|
|
<p>The data in the following tables is provided as a reference point to help users
|
|
validate observed performance. It should not be considered as the peak
|
|
performance that can be delivered by TensorRT-LLM.</p>
|
|
<section id="known-issues">
|
|
<h2>Known Issues<a class="headerlink" href="#known-issues" title="Link to this heading"></a></h2>
|
|
<p>The following issues are being addressed to improve the efficiency of TensorRT-LLM.</p>
|
|
<section id="known-allreduce-performance-issue-on-amd-based-cpu-platforms">
|
|
<h3>Known AllReduce performance issue on AMD-based CPU platforms<a class="headerlink" href="#known-allreduce-performance-issue-on-amd-based-cpu-platforms" title="Link to this heading"></a></h3>
|
|
<p>We observed a performance issue on NCCL 2.23.4, which can be worked around by setting <code class="docutils literal notranslate"><span class="pre">NCCL_P2P_LEVEL</span></code> to <code class="docutils literal notranslate"><span class="pre">SYS</span></code>:</p>
|
|
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">NCCL_P2P_LEVEL</span><span class="o">=</span><span class="n">SYS</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Multi-GPU cases could be affected by this issue, which is being addressed.</p>
|
|
</section>
|
|
<section id="fused-matmul-gated-silu-llama">
|
|
<h3>Fused Matmul + Gated-SiLU (LLaMA)<a class="headerlink" href="#fused-matmul-gated-silu-llama" title="Link to this heading"></a></h3>
|
|
<p>The current implementation combines two Matmul operations into one Matmul followed by
|
|
a separate SwiGLU kernel (when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp=enable</span></code> is enabled). There is also a more
|
|
efficient implementation that runs a single fused Matmul + SwiGLU kernel for FP8 on Hopper
|
|
(when <code class="docutils literal notranslate"><span class="pre">--use_fused_mlp=enable</span> <span class="pre">--gemm_swiglu_plugin</span> <span class="pre">fp8</span></code> is enabled). The gemm_swiglu_plugin
|
|
will support more data types and GPU architectures in a future release.</p>
|
|
</section>
|
|
<section id="trtllm-bench-has-known-issues-on-gh200">
|
|
<h3>Trtllm-bench has known issues on GH200<a class="headerlink" href="#trtllm-bench-has-known-issues-on-gh200" title="Link to this heading"></a></h3>
|
|
<p>For release v0.15, on GH200 systems, we recommend using the legacy flow based on <em>gptManagerBenchmark</em> to measure performance.</p>
|
|
</section>
|
|
</section>
|
|
<section id="throughput-measurements">
|
|
<h2>Throughput Measurements<a class="headerlink" href="#throughput-measurements" title="Link to this heading"></a></h2>
|
|
<p>The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
|
|
and shows the throughput of a client-server scenario under maximum load.</p>
|
|
<p>The performance numbers below were collected using the steps described in this document.</p>
|
|
<p>Note that for GH200 tests, TRT-LLM engines were built using <em>trtllm-bench build</em> but benchmarked with <em>gptManagerBenchmark</em>.</p>
|
|
<p><strong>All data in the table below was generated using version 0.15.0 and presents token throughput in tokens/second.</strong></p>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p>GPU</p></td>
|
|
<td><p></p></td>
|
|
<td><p>H100 80GB HBM3</p></td>
|
|
<td><p></p></td>
|
|
<td><p>A100-SXM4-80GB</p></td>
|
|
<td><p>A100-PCIE-80GB</p></td>
|
|
<td><p>L40S</p></td>
|
|
<td><p>GH200 96GB HBM3 CG1</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>Precision</p></td>
|
|
<td><p></p></td>
|
|
<td><p>FP8</p></td>
|
|
<td><p>Mixed</p></td>
|
|
<td><p>Mixed</p></td>
|
|
<td><p>Mixed</p></td>
|
|
<td><p>FP8</p></td>
|
|
<td><p>FP8</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>Model</p></td>
|
|
<td><p>TP Size</p></td>
|
|
<td><p>Runtime Input/Output Lengths</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA v3 70B</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>3197.73</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4023.31</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>826.72</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1855.98</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>915.15</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>658.87</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1483.67</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>772.64</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1587.16</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>331.26</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>425.89</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>383.46</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>823.43</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>217.12</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>391.38</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>6529.47</p></td>
|
|
<td><p>3137.86</p></td>
|
|
<td><p>1316.68</p></td>
|
|
<td><p>792.95</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>6008.16</p></td>
|
|
<td><p>783.76</p></td>
|
|
<td><p>532.07</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>3561.24</p></td>
|
|
<td><p>404.23</p></td>
|
|
<td><p>285.37</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>4792.7</p></td>
|
|
<td><p>658.7</p></td>
|
|
<td><p>436.46</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>4221.4</p></td>
|
|
<td><p>759.56</p></td>
|
|
<td><p>484.59</p></td>
|
|
<td><p>268.09</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>773.11</p></td>
|
|
<td><p>318.58</p></td>
|
|
<td><p>147.22</p></td>
|
|
<td><p>96.65</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>2648.62</p></td>
|
|
<td><p>373.71</p></td>
|
|
<td><p>255.21</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>905.34</p></td>
|
|
<td><p>224.99</p></td>
|
|
<td><p>123.5</p></td>
|
|
<td><p>75.54</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>10848.71</p></td>
|
|
<td><p>6387.29</p></td>
|
|
<td><p>2713.51</p></td>
|
|
<td><p>1347.36</p></td>
|
|
<td><p>1474</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>10973.67</p></td>
|
|
<td><p>5767.81</p></td>
|
|
<td><p>2684.63</p></td>
|
|
<td><p>1414.31</p></td>
|
|
<td><p>1912.29</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>7426.74</p></td>
|
|
<td><p>3421.36</p></td>
|
|
<td><p>1914.57</p></td>
|
|
<td><p>1140.75</p></td>
|
|
<td><p>1357.84</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>9575.94</p></td>
|
|
<td><p>4311.78</p></td>
|
|
<td><p>2181.56</p></td>
|
|
<td><p>1276.59</p></td>
|
|
<td><p>1602.99</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>7234.67</p></td>
|
|
<td><p>4027.52</p></td>
|
|
<td><p>1876.99</p></td>
|
|
<td><p>927.93</p></td>
|
|
<td><p>1193.23</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>1318.11</p></td>
|
|
<td><p>781.29</p></td>
|
|
<td><p>319.91</p></td>
|
|
<td><p>161.66</p></td>
|
|
<td><p>174.02</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>5185.7</p></td>
|
|
<td><p>2584.66</p></td>
|
|
<td><p>1339.76</p></td>
|
|
<td><p>872.31</p></td>
|
|
<td><p>910.92</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>1568.88</p></td>
|
|
<td><p>855.16</p></td>
|
|
<td><p>388.86</p></td>
|
|
<td><p>216.5</p></td>
|
|
<td><p>242.62</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>15440.55</p></td>
|
|
<td><p>10966.81</p></td>
|
|
<td><p>4647.93</p></td>
|
|
<td><p>962.8</p></td>
|
|
<td><p>1381.32</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>16416.2</p></td>
|
|
<td><p>10270.37</p></td>
|
|
<td><p>5046.42</p></td>
|
|
<td><p>1487.53</p></td>
|
|
<td><p>2120.54</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>12247.71</p></td>
|
|
<td><p>6932.27</p></td>
|
|
<td><p>3672.17</p></td>
|
|
<td><p>1391.51</p></td>
|
|
<td><p>1855.21</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>14561.62</p></td>
|
|
<td><p>8967.15</p></td>
|
|
<td><p>4379.68</p></td>
|
|
<td><p>1205.63</p></td>
|
|
<td><p>1879.86</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>11226.01</p></td>
|
|
<td><p>6973.77</p></td>
|
|
<td><p>3236.83</p></td>
|
|
<td><p>883.65</p></td>
|
|
<td><p>1244.32</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>2057.59</p></td>
|
|
<td><p>1341.65</p></td>
|
|
<td><p>558.45</p></td>
|
|
<td><p>141.12</p></td>
|
|
<td><p>164.34</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>7813.57</p></td>
|
|
<td><p>4518.75</p></td>
|
|
<td><p>2395.15</p></td>
|
|
<td><p>769.53</p></td>
|
|
<td><p>1091.57</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>2564.74</p></td>
|
|
<td><p>1612.14</p></td>
|
|
<td><p>706.33</p></td>
|
|
<td><p>217.62</p></td>
|
|
<td><p>243.14</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA v3.1 8B</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>27792.16</p></td>
|
|
<td><p>16116.63</p></td>
|
|
<td><p>6552.62</p></td>
|
|
<td><p>5158.57</p></td>
|
|
<td><p>8982.97</p></td>
|
|
<td><p>30803.29</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>19965.18</p></td>
|
|
<td><p>9894.49</p></td>
|
|
<td><p>5220.03</p></td>
|
|
<td><p>4640.02</p></td>
|
|
<td><p>5297.21</p></td>
|
|
<td><p>20770.93</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>13222.06</p></td>
|
|
<td><p>5758.98</p></td>
|
|
<td><p>3326.45</p></td>
|
|
<td><p>2906.77</p></td>
|
|
<td><p>2989.17</p></td>
|
|
<td><p>12487.35</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>15782.2</p></td>
|
|
<td><p>7953.1</p></td>
|
|
<td><p>4191.62</p></td>
|
|
<td><p>3736.1</p></td>
|
|
<td><p>4263.97</p></td>
|
|
<td><p>19175.02</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>14797.28</p></td>
|
|
<td><p>7721.07</p></td>
|
|
<td><p>3753.46</p></td>
|
|
<td><p>3328.02</p></td>
|
|
<td><p>4013.95</p></td>
|
|
<td><p>15955.43</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>3496.41</p></td>
|
|
<td><p>1972.07</p></td>
|
|
<td><p>789.56</p></td>
|
|
<td><p>630.86</p></td>
|
|
<td><p>1055.55</p></td>
|
|
<td><p>4011.99</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>8980.42</p></td>
|
|
<td><p>4370.61</p></td>
|
|
<td><p>2366.86</p></td>
|
|
<td><p>2125.4</p></td>
|
|
<td><p>2162.8</p></td>
|
|
<td><p>9072.93</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>3477.61</p></td>
|
|
<td><p>1802.2</p></td>
|
|
<td><p>816.09</p></td>
|
|
<td><p>693.38</p></td>
|
|
<td><p>972.2</p></td>
|
|
<td><p>3957.15</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>1378.69</p></td>
|
|
<td><p>621.58</p></td>
|
|
<td><p>330.47</p></td>
|
|
<td><p>298.79</p></td>
|
|
<td><p>326.02</p></td>
|
|
<td><p>1459.86</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>LLaMA v3.1 70B</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>3173.65</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4108.23</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>804.73</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1940.33</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>981.15</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>652.24</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1526.49</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>775.07</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1575.4</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>328.44</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>453.06</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>388.02</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>838.55</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>217.98</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>383.32</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>124.38</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>6399.24</p></td>
|
|
<td><p>3143.32</p></td>
|
|
<td><p>1330.41</p></td>
|
|
<td><p>790.66</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>5920.14</p></td>
|
|
<td><p>784.73</p></td>
|
|
<td><p>532.31</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>3580.79</p></td>
|
|
<td><p>418.75</p></td>
|
|
<td><p>285.01</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>4775.52</p></td>
|
|
<td><p>660.68</p></td>
|
|
<td><p>437.64</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>4247.38</p></td>
|
|
<td><p>785.36</p></td>
|
|
<td><p>483.87</p></td>
|
|
<td><p>267.63</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>774.11</p></td>
|
|
<td><p>315.43</p></td>
|
|
<td><p>144.88</p></td>
|
|
<td><p>94.83</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>2667.23</p></td>
|
|
<td><p>384.36</p></td>
|
|
<td><p>259.65</p></td>
|
|
<td><p>137.09</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>901.84</p></td>
|
|
<td><p>210.7</p></td>
|
|
<td><p>124.33</p></td>
|
|
<td><p>76.77</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>410.93</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>10589.19</p></td>
|
|
<td><p>6392.74</p></td>
|
|
<td><p>2716.71</p></td>
|
|
<td><p>1192.33</p></td>
|
|
<td><p>1469.28</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>11063.97</p></td>
|
|
<td><p>5742.27</p></td>
|
|
<td><p>2663.76</p></td>
|
|
<td><p>1385.61</p></td>
|
|
<td><p>1911.43</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>7428.89</p></td>
|
|
<td><p>3457.03</p></td>
|
|
<td><p>1913.13</p></td>
|
|
<td><p>1206.15</p></td>
|
|
<td><p>1357.83</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>9504.33</p></td>
|
|
<td><p>4375.09</p></td>
|
|
<td><p>2193.81</p></td>
|
|
<td><p>1248.45</p></td>
|
|
<td><p>1599.38</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>7306.35</p></td>
|
|
<td><p>4075.52</p></td>
|
|
<td><p>1889.72</p></td>
|
|
<td><p>999.4</p></td>
|
|
<td><p>1187.23</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>1316.33</p></td>
|
|
<td><p>779.81</p></td>
|
|
<td><p>320.96</p></td>
|
|
<td><p>162.09</p></td>
|
|
<td><p>176.41</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>5166.41</p></td>
|
|
<td><p>2609.39</p></td>
|
|
<td><p>1341.99</p></td>
|
|
<td><p>874.11</p></td>
|
|
<td><p>909.3</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>1566.63</p></td>
|
|
<td><p>874.96</p></td>
|
|
<td><p>389.99</p></td>
|
|
<td><p>218.29</p></td>
|
|
<td><p>242.95</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>915.06</p></td>
|
|
<td><p>406.36</p></td>
|
|
<td><p>209.39</p></td>
|
|
<td><p>141.13</p></td>
|
|
<td><p>158.35</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>15427.05</p></td>
|
|
<td><p>10959.63</p></td>
|
|
<td><p>4595.66</p></td>
|
|
<td><p>943.87</p></td>
|
|
<td><p>1381.25</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>16533.07</p></td>
|
|
<td><p>10252.11</p></td>
|
|
<td><p>4967.17</p></td>
|
|
<td><p>1605.66</p></td>
|
|
<td><p>2157.58</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>12008.26</p></td>
|
|
<td><p>6915.81</p></td>
|
|
<td><p>3594.1</p></td>
|
|
<td><p>1449.32</p></td>
|
|
<td><p>1895.68</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>14508.43</p></td>
|
|
<td><p>8942.09</p></td>
|
|
<td><p>4349.21</p></td>
|
|
<td><p>1238.68</p></td>
|
|
<td><p>1877.86</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>11086.68</p></td>
|
|
<td><p>6983.63</p></td>
|
|
<td><p>3285.33</p></td>
|
|
<td><p>907.21</p></td>
|
|
<td><p>1242.34</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>2064.53</p></td>
|
|
<td><p>1351.25</p></td>
|
|
<td><p>556.48</p></td>
|
|
<td><p>140.49</p></td>
|
|
<td><p>163.53</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>7768.15</p></td>
|
|
<td><p>4515.31</p></td>
|
|
<td><p>2464.13</p></td>
|
|
<td><p>811.88</p></td>
|
|
<td><p>1092.72</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>2533.55</p></td>
|
|
<td><p>1589.18</p></td>
|
|
<td><p>700.7</p></td>
|
|
<td><p>212.07</p></td>
|
|
<td><p>242.61</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>1447.5</p></td>
|
|
<td><p>847.42</p></td>
|
|
<td><p>399.8</p></td>
|
|
<td><p>140.86</p></td>
|
|
<td><p>198.77</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>Mistral 7B</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>30177.4</p></td>
|
|
<td><p>17025.15</p></td>
|
|
<td><p>6968.4</p></td>
|
|
<td><p>5444.55</p></td>
|
|
<td><p>9526.7</p></td>
|
|
<td><p>33795.78</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>22060.45</p></td>
|
|
<td><p>10324.05</p></td>
|
|
<td><p>5556.98</p></td>
|
|
<td><p>4960.48</p></td>
|
|
<td><p>5669.19</p></td>
|
|
<td><p>22724.8</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>13773.03</p></td>
|
|
<td><p>6205.41</p></td>
|
|
<td><p>3430.11</p></td>
|
|
<td><p>3077.47</p></td>
|
|
<td><p>3091.88</p></td>
|
|
<td><p>13916.10</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>17229.29</p></td>
|
|
<td><p>8294.02</p></td>
|
|
<td><p>4339.77</p></td>
|
|
<td><p>3883.38</p></td>
|
|
<td><p>4498.74</p></td>
|
|
<td><p>20702.51</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>15428.87</p></td>
|
|
<td><p>7894.2</p></td>
|
|
<td><p>3874.65</p></td>
|
|
<td><p>3433.27</p></td>
|
|
<td><p>4118.6</p></td>
|
|
<td><p>17061.12</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>3546.44</p></td>
|
|
<td><p>2001.13</p></td>
|
|
<td><p>793.57</p></td>
|
|
<td><p>635.46</p></td>
|
|
<td><p>1067.47</p></td>
|
|
<td><p>4039.02</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>9118.64</p></td>
|
|
<td><p>4520.74</p></td>
|
|
<td><p>2440.45</p></td>
|
|
<td><p>2187.82</p></td>
|
|
<td><p>2231.66</p></td>
|
|
<td><p>9998.65</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>3493.52</p></td>
|
|
<td><p>1838.75</p></td>
|
|
<td><p>828.17</p></td>
|
|
<td><p>702.36</p></td>
|
|
<td><p>999.35</p></td>
|
|
<td><p>4042.82</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>1267.96</p></td>
|
|
<td><p>641</p></td>
|
|
<td><p>334.06</p></td>
|
|
<td><p>296.1</p></td>
|
|
<td><p>336.18</p></td>
|
|
<td><p>1521.67</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>Mixtral 8x7B</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>15882.61</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16515.3</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>8214.24</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>10956.79</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>4671.49</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>6489.02</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>6739.79</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8809.27</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>6787.62</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8402.89</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>1885.43</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1932.28</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>3725.12</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5248.95</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>1762.25</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2098.53</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>670.61</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>870.76</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>27155.63</p></td>
|
|
<td><p>15904.17</p></td>
|
|
<td><p>5758.21</p></td>
|
|
<td><p>3788.61</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>23009.9</p></td>
|
|
<td><p>7660.05</p></td>
|
|
<td><p>4365.92</p></td>
|
|
<td><p>2219.51</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>14095.62</p></td>
|
|
<td><p>4287.96</p></td>
|
|
<td><p>2502.13</p></td>
|
|
<td><p>1272.21</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>16785.63</p></td>
|
|
<td><p>6454.11</p></td>
|
|
<td><p>3618.34</p></td>
|
|
<td><p>1633.61</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>15867.12</p></td>
|
|
<td><p>6492.47</p></td>
|
|
<td><p>3316.43</p></td>
|
|
<td><p>1734.39</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>3367.65</p></td>
|
|
<td><p>1895.85</p></td>
|
|
<td><p>691.68</p></td>
|
|
<td><p>465.45</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>10464.57</p></td>
|
|
<td><p>3642.6</p></td>
|
|
<td><p>1990.95</p></td>
|
|
<td><p>1038.11</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>3591.62</p></td>
|
|
<td><p>1722.61</p></td>
|
|
<td><p>755.64</p></td>
|
|
<td><p>468.26</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>1739.08</p></td>
|
|
<td><p>655.5</p></td>
|
|
<td><p>334.67</p></td>
|
|
<td><p>187.43</p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>40731.73</p></td>
|
|
<td><p>28272.32</p></td>
|
|
<td><p>11612.27</p></td>
|
|
<td><p>6075.21</p></td>
|
|
<td><p>6756.75</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>41117.27</p></td>
|
|
<td><p>23327.39</p></td>
|
|
<td><p>11755.57</p></td>
|
|
<td><p>7851.32</p></td>
|
|
<td><p>7989.81</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>28143.35</p></td>
|
|
<td><p>13906.89</p></td>
|
|
<td><p>8052.85</p></td>
|
|
<td><p>5920.37</p></td>
|
|
<td><p>5655.07</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>34507.24</p></td>
|
|
<td><p>16964.37</p></td>
|
|
<td><p>9185.2</p></td>
|
|
<td><p>6243.72</p></td>
|
|
<td><p>6605.53</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>27614.12</p></td>
|
|
<td><p>16217.64</p></td>
|
|
<td><p>7640.13</p></td>
|
|
<td><p>4818.03</p></td>
|
|
<td><p>5132.48</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>5275.25</p></td>
|
|
<td><p>3416.82</p></td>
|
|
<td><p>1383.85</p></td>
|
|
<td><p>740</p></td>
|
|
<td><p>811.01</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>18441.12</p></td>
|
|
<td><p>10381.54</p></td>
|
|
<td><p>5403.69</p></td>
|
|
<td><p>3842.39</p></td>
|
|
<td><p>3837.68</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>6340.27</p></td>
|
|
<td><p>3689.37</p></td>
|
|
<td><p>1632.92</p></td>
|
|
<td><p>966.38</p></td>
|
|
<td><p>1072.16</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>3231.36</p></td>
|
|
<td><p>1717.02</p></td>
|
|
<td><p>856.62</p></td>
|
|
<td><p>619.01</p></td>
|
|
<td><p>655.74</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>51899.21</p></td>
|
|
<td><p>40517.74</p></td>
|
|
<td><p>18434.51</p></td>
|
|
<td><p>5573.24</p></td>
|
|
<td><p>6349.85</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>63701.21</p></td>
|
|
<td><p>40322.45</p></td>
|
|
<td><p>22120.7</p></td>
|
|
<td><p>8657.63</p></td>
|
|
<td><p>9696.71</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>47833.64</p></td>
|
|
<td><p>27121.19</p></td>
|
|
<td><p>16280.11</p></td>
|
|
<td><p>7747.32</p></td>
|
|
<td><p>8038.78</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>53260.36</p></td>
|
|
<td><p>32190.46</p></td>
|
|
<td><p>18439.46</p></td>
|
|
<td><p>7393.45</p></td>
|
|
<td><p>8319.84</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>40321.28</p></td>
|
|
<td><p>27487.98</p></td>
|
|
<td><p>13842.01</p></td>
|
|
<td><p>5041.55</p></td>
|
|
<td><p>5593.52</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>7609.41</p></td>
|
|
<td><p>5396.72</p></td>
|
|
<td><p>2295.12</p></td>
|
|
<td><p>670.71</p></td>
|
|
<td><p>765.2</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>25624.61</p></td>
|
|
<td><p>17823.29</p></td>
|
|
<td><p>10114.34</p></td>
|
|
<td><p>4509.4</p></td>
|
|
<td><p>4791.64</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>9527.29</p></td>
|
|
<td><p>6475.64</p></td>
|
|
<td><p>3009.15</p></td>
|
|
<td><p>973.63</p></td>
|
|
<td><p>1094.62</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>5507.84</p></td>
|
|
<td><p>3156.06</p></td>
|
|
<td><p>1673.29</p></td>
|
|
<td><p>770.41</p></td>
|
|
<td><p>872.96</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>Mixtral 8x22B</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>128, 128</p></td>
|
|
<td><p>22834.12</p></td>
|
|
<td><p>16565.76</p></td>
|
|
<td><p>6914.09</p></td>
|
|
<td><p></p></td>
|
|
<td><p>2470.15</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 2048</p></td>
|
|
<td><p>24975.75</p></td>
|
|
<td><p>11676.16</p></td>
|
|
<td><p>7170.04</p></td>
|
|
<td><p></p></td>
|
|
<td><p>3629.98</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>128, 4096</p></td>
|
|
<td><p>17564.49</p></td>
|
|
<td><p>7020.49</p></td>
|
|
<td><p>5052.47</p></td>
|
|
<td><p></p></td>
|
|
<td><p>2933.79</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>500, 2000</p></td>
|
|
<td><p>21498.7</p></td>
|
|
<td><p>10606.93</p></td>
|
|
<td><p>6151.81</p></td>
|
|
<td><p></p></td>
|
|
<td><p>2959.66</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>1000, 1000</p></td>
|
|
<td><p>16383.52</p></td>
|
|
<td><p>9803.47</p></td>
|
|
<td><p>4790.88</p></td>
|
|
<td><p></p></td>
|
|
<td><p>2146.74</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 128</p></td>
|
|
<td><p>2945.44</p></td>
|
|
<td><p>2028.84</p></td>
|
|
<td><p>827.34</p></td>
|
|
<td><p></p></td>
|
|
<td><p>291.53</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>2048, 2048</p></td>
|
|
<td><p>11238.84</p></td>
|
|
<td><p>5804.75</p></td>
|
|
<td><p>3395</p></td>
|
|
<td><p></p></td>
|
|
<td><p>1830.44</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>5000, 500</p></td>
|
|
<td><p>3755.98</p></td>
|
|
<td><p>2281.8</p></td>
|
|
<td><p>1032.41</p></td>
|
|
<td><p></p></td>
|
|
<td><p>417.12</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>20000, 2000</p></td>
|
|
<td><p>2151.07</p></td>
|
|
<td><p>1186.32</p></td>
|
|
<td><p>597.81</p></td>
|
|
<td><p></p></td>
|
|
<td><p>323.37</p></td>
|
|
<td><p></p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<p><em>TP stands for Tensor Parallelism</em></p>
|
|
</section>
|
|
<section id="reproducing-benchmarked-results">
|
|
<h2>Reproducing Benchmarked Results<a class="headerlink" href="#reproducing-benchmarked-results" title="Link to this heading"></a></h2>
|
|
<blockquote>
|
|
<div><p><strong>Note:</strong> The only models supported in this workflow are those listed in the table above.</p>
|
|
</div></blockquote>
|
|
<p>The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
|
|
description of this benchmarking workflow, see the <a class="reference external" href="https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html">benchmarking suite documentation</a>.</p>
|
|
<section id="commands">
|
|
<h3>Commands<a class="headerlink" href="#commands" title="Link to this heading"></a></h3>
|
|
<section id="for-non-gh200-systems">
|
|
<h4>For non-GH200 systems<a class="headerlink" href="#for-non-gh200-systems" title="Link to this heading"></a></h4>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head text-left"><p>Stage</p></th>
|
|
<th class="head"><p>Description</p></th>
|
|
<th class="head"><p>Command</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset">Dataset</a></p></td>
|
|
<td><p>Create a synthetic dataset</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">python</span> <span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">--stdout</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=$num_requests</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span> <span class="pre">></span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#engine-building">Build</a></p></td>
|
|
<td><p>Build a TensorRT-LLM engine</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">--model</span> <span class="pre">$model_name</span> <span class="pre">build</span> <span class="pre">--tp_size</span> <span class="pre">$tp_size</span> <span class="pre">--pp_size</span> <span class="pre">$pp_size</span> <span class="pre">--quantization</span> <span class="pre">FP8</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#running-the-benchmark">Run</a></p></td>
|
|
<td><p>Run a benchmark with a dataset</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">--model</span> <span class="pre">$model_name</span> <span class="pre">throughput</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span> <span class="pre">--engine_dir</span> <span class="pre">$engine_dir</span></code></p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
<section id="for-gh200-systems-only">
|
|
<h4>For GH200 systems only<a class="headerlink" href="#for-gh200-systems-only" title="Link to this heading"></a></h4>
|
|
<p>For release v0.15, on GH200 systems, the recommendation is to use the legacy flow based on <em>gptManagerBenchmark</em> to measure performance.</p>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head text-left"><p>Stage</p></th>
|
|
<th class="head"><p>Description</p></th>
|
|
<th class="head"><p>Command</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset">Dataset</a></p></td>
|
|
<td><p>Create a synthetic dataset for engine building</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">python</span> <span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">--stdout</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=$num_requests</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span> <span class="pre">></span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#engine-building">Build</a></p></td>
|
|
<td><p>Build a TensorRT-LLM engine</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">--model</span> <span class="pre">$model_name</span> <span class="pre">build</span> <span class="pre">--tp_size</span> <span class="pre">$tp_size</span> <span class="pre">--quantization</span> <span class="pre">FP8</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><a class="reference internal" href="#preparing-a-dataset">Dataset</a></p></td>
|
|
<td><p>Create a synthetic dataset for benchmarking in JSON format</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">python</span> <span class="pre">benchmarks/cpp/prepare_dataset.py</span> <span class="pre">--output=$dataset_file_json</span> <span class="pre">--tokenizer=$model_name</span> <span class="pre">token-norm-dist</span> <span class="pre">--num-requests=$num_requests</span> <span class="pre">--input-mean=$isl</span> <span class="pre">--output-mean=$osl</span> <span class="pre">--input-stdev=0</span> <span class="pre">--output-stdev=0</span></code></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><a class="reference internal" href="#running-the-benchmark">Run</a></p></td>
|
|
<td><p>Run a benchmark with a dataset in JSON format</p></td>
|
|
<td><p><code class="docutils literal notranslate"><span class="pre">/app/tensorrt_llm/benchmarks/cpp/gptManagerBenchmark</span> <span class="pre">--engine_dir</span> <span class="pre">$engine_dir</span> <span class="pre">--type</span> <span class="pre">IFB</span> <span class="pre">--api</span> <span class="pre">executor</span> <span class="pre">--dataset</span> <span class="pre">$dataset_file_json</span> <span class="pre">--eos_id</span> <span class="pre">-1</span> <span class="pre">--log_iteration_data</span> <span class="pre">--scheduler_policy</span> <span class="pre">guaranteed_no_evict</span> <span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.95</span> <span class="pre">--output_csv</span> <span class="pre">result.csv</span> <span class="pre">--request_rate</span> <span class="pre">-1.0</span> <span class="pre">--enable_chunked_context</span> <span class="pre">--warm_up</span> <span class="pre">0</span></code></p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
</section>
|
|
<section id="variables">
|
|
<h3>Variables<a class="headerlink" href="#variables" title="Link to this heading"></a></h3>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head text-left"><p>Name</p></th>
|
|
<th class="head"><p>Description</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$isl</span></code></p></td>
|
|
<td><p>Benchmark input sequence length.</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$osl</span></code></p></td>
|
|
<td><p>Benchmark output sequence length.</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$tp_size</span></code></p></td>
|
|
<td><p>Tensor parallel mapping degree to run the benchmark with.</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$pp_size</span></code></p></td>
|
|
<td><p>Pipeline parallel mapping degree to run the benchmark with.</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$engine_dir</span></code></p></td>
|
|
<td><p>Location to store built engine file (can be deleted after running benchmarks).</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$model_name</span></code></p></td>
|
|
<td><p>Hugging Face model name (e.g., meta-llama/Llama-2-7b-hf) or the path to a local weights directory.</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code></p></td>
|
|
<td><p>Location of the dataset file generated by <code class="docutils literal notranslate"><span class="pre">prepare_dataset.py</span></code>.</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$num_requests</span></code></p></td>
|
|
<td><p>The number of requests to generate for dataset generation.</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td class="text-left"><p><code class="docutils literal notranslate"><span class="pre">$seq_len</span></code></p></td>
|
|
<td><p>The total sequence length, i.e. the sum of the input and output sequence lengths (ISL + OSL).</p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
<section id="preparing-a-dataset">
|
|
<h3>Preparing a Dataset<a class="headerlink" href="#preparing-a-dataset" title="Link to this heading"></a></h3>
|
|
<p>In order to prepare a dataset, you can use the provided <a class="reference download internal" download="" href="../_downloads/ea8faa5e98124e92f96b66dc586fb429/prepare_dataset.py"><span class="xref download myst">script</span></a>.
|
|
To generate a synthetic dataset, run the following command:</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>benchmarks/cpp/prepare_dataset.py<span class="w"> </span>--tokenizer<span class="o">=</span><span class="nv">$model_name</span><span class="w"> </span>--stdout<span class="w"> </span>token-norm-dist<span class="w"> </span>--num-requests<span class="o">=</span><span class="nv">$num_requests</span><span class="w"> </span>--input-mean<span class="o">=</span><span class="nv">$isl</span><span class="w"> </span>--output-mean<span class="o">=</span><span class="nv">$osl</span><span class="w"> </span>--input-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>--output-stdev<span class="o">=</span><span class="m">0</span><span class="w"> </span>><span class="w"> </span><span class="nv">$dataset_file</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The command will generate a text file located at the path specified by <code class="docutils literal notranslate"><span class="pre">$dataset_file</span></code> where all requests are of the same
|
|
input/output sequence length combinations. The script works by using the tokenizer to retrieve the vocabulary size and
|
|
randomly sample token IDs from it to create entirely random sequences. In the command above, all requests will be uniform
|
|
because the standard deviations for both input and output sequences are set to 0.</p>
|
|
<p>For each input and output sequence length combination, the table below details the <code class="docutils literal notranslate"><span class="pre">$num_requests</span></code> that were used. For
|
|
shorter input and output lengths, a larger number of requests were used to guarantee that the system hit a steady state
|
|
because requests enter and exit the system at a much faster rate. For longer input/output sequence lengths, requests
|
|
remain in the system longer and therefore require fewer requests to achieve steady state.</p>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head"><p>Input Length</p></th>
|
|
<th class="head"><p>Output Length</p></th>
|
|
<th class="head"><p>$seq_len</p></th>
|
|
<th class="head"><p>$num_requests</p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td><p>128</p></td>
|
|
<td><p>128</p></td>
|
|
<td><p>256</p></td>
|
|
<td><p>30000</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>128</p></td>
|
|
<td><p>2048</p></td>
|
|
<td><p>2176</p></td>
|
|
<td><p>3000</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>128</p></td>
|
|
<td><p>4096</p></td>
|
|
<td><p>4224</p></td>
|
|
<td><p>1500</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>2048</p></td>
|
|
<td><p>128</p></td>
|
|
<td><p>2176</p></td>
|
|
<td><p>3000</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>2048</p></td>
|
|
<td><p>2048</p></td>
|
|
<td><p>4096</p></td>
|
|
<td><p>1500</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>5000</p></td>
|
|
<td><p>500</p></td>
|
|
<td><p>5500</p></td>
|
|
<td><p>1500</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>1000</p></td>
|
|
<td><p>1000</p></td>
|
|
<td><p>2000</p></td>
|
|
<td><p>3000</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>500</p></td>
|
|
<td><p>2000</p></td>
|
|
<td><p>2500</p></td>
|
|
<td><p>3000</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p>20000</p></td>
|
|
<td><p>2000</p></td>
|
|
<td><p>22000</p></td>
|
|
<td><p>1000</p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</section>
|
|
<section id="engine-building">
|
|
<h3>Engine Building<a class="headerlink" href="#engine-building" title="Link to this heading"></a></h3>
|
|
<p>All engines are built using the <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">build</span></code> subcommand.
|
|
The basic command for FP8 quantized engines is as follows:</p>
|
|
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>trtllm-bench --model $model_name build --tp_size $tp_size --pp_size $pp_size --quantization FP8 --dataset $dataset_file
|
|
</pre></div>
|
|
</div>
|
|
<p>When providing <code class="docutils literal notranslate"><span class="pre">--dataset</span></code> in the build subcommand, <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">build</span></code> uses high-level statistics of the dataset (average ISL/OSL, max sequence length) and tuning heuristics to optimize engine build settings.</p>
|
|
<p>Alternatively, if you would like to build the engine with specific settings, you can do so by specifying the values for <code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code> and <code class="docutils literal notranslate"><span class="pre">max_num_tokens</span></code>:</p>
|
|
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>trtllm-bench --model $model_name build --tp_size $tp_size --pp_size $pp_size --quantization FP8 --max_seq_len $seq_len --max_batch_size $max_bs --max_num_tokens $max_token
|
|
</pre></div>
|
|
</div>
|
|
<p>If you would like to build an FP16 engine without any quantization, simply remove the <code class="docutils literal notranslate"><span class="pre">--quantization</span> <span class="pre">FP8</span></code> option.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note:</strong> If you specify FP8 quantization, the KV cache will automatically be set to FP8 as well!</p>
|
|
</div></blockquote>
|
|
<p>The <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">build</span></code> subcommand will output the path where the engine is located upon a successful build. For example,</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="o">===========================================================</span>
|
|
ENGINE<span class="w"> </span>SAVED:<span class="w"> </span>/tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
|
|
<span class="o">===========================================================</span>
|
|
</pre></div>
|
|
</div>
|
|
</section>
|
|
<section id="running-the-benchmark">
|
|
<h3>Running the Benchmark<a class="headerlink" href="#running-the-benchmark" title="Link to this heading"></a></h3>
|
|
</section>
|
|
<section id="id1">
|
|
<h3>For non-GH200 systems<a class="headerlink" href="#id1" title="Link to this heading"></a></h3>
|
|
<p>To run the benchmark with the generated dataset, simply use the <code class="docutils literal notranslate"><span class="pre">trtllm-bench</span> <span class="pre">throughput</span></code> subcommand. The benchmarker will
|
|
run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide
|
|
the path to the engine from the <a class="reference internal" href="#engine-building">build</a> phase and a <a class="reference internal" href="#preparing-a-dataset">generated dataset</a>.</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>trtllm-bench<span class="w"> </span>--model<span class="w"> </span><span class="nv">$model_name</span><span class="w"> </span>throughput<span class="w"> </span>--dataset<span class="w"> </span><span class="nv">$dataset_file</span><span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_dir</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>In the majority of cases, we also use a higher KV cache percentage by setting <code class="docutils literal notranslate"><span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.95</span></code> in the benchmark command. This allows us to obtain better performance than the default setting of <code class="docutils literal notranslate"><span class="pre">0.90</span></code>. We fall back to <code class="docutils literal notranslate"><span class="pre">0.90</span></code> if we hit an out-of-memory issue.</p>
|
|
<p>The results will be printed to the terminal upon benchmark completion. For example,</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span>ENGINE<span class="w"> </span><span class="nv">DETAILS</span>
|
|
<span class="o">===========================================================</span>
|
|
Model:<span class="w"> </span>meta-llama/Llama-2-7b-hf
|
|
Engine<span class="w"> </span>Directory:<span class="w"> </span>/tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
|
|
TensorRT-LLM<span class="w"> </span>Version:<span class="w"> </span><span class="m">0</span>.12.0
|
|
Dtype:<span class="w"> </span>float16
|
|
KV<span class="w"> </span>Cache<span class="w"> </span>Dtype:<span class="w"> </span>FP8
|
|
Quantization:<span class="w"> </span>FP8
|
|
Max<span class="w"> </span>Input<span class="w"> </span>Length:<span class="w"> </span><span class="m">2048</span>
|
|
Max<span class="w"> </span>Sequence<span class="w"> </span>Length:<span class="w"> </span><span class="nv">4098</span>
|
|
|
|
<span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span>WORLD<span class="w"> </span>+<span class="w"> </span>RUNTIME<span class="w"> </span><span class="nv">INFORMATION</span>
|
|
<span class="o">===========================================================</span>
|
|
TP<span class="w"> </span>Size:<span class="w"> </span><span class="m">1</span>
|
|
PP<span class="w"> </span>Size:<span class="w"> </span><span class="m">1</span>
|
|
Max<span class="w"> </span>Runtime<span class="w"> </span>Batch<span class="w"> </span>Size:<span class="w"> </span><span class="m">4096</span>
|
|
Max<span class="w"> </span>Runtime<span class="w"> </span>Tokens:<span class="w"> </span><span class="m">8192</span>
|
|
Scheduling<span class="w"> </span>Policy:<span class="w"> </span>Guaranteed<span class="w"> </span>No<span class="w"> </span>Evict
|
|
KV<span class="w"> </span>Memory<span class="w"> </span>Percentage:<span class="w"> </span><span class="m">99</span>.0%
|
|
Issue<span class="w"> </span>Rate<span class="w"> </span><span class="o">(</span>req/sec<span class="o">)</span>:<span class="w"> </span><span class="m">3</span>.680275266452667e+18
|
|
<span class="o">===========================================================</span>
|
|
<span class="o">=</span><span class="w"> </span><span class="nv">STATISTICS</span>
|
|
<span class="o">===========================================================</span>
|
|
Number<span class="w"> </span>of<span class="w"> </span>requests:<span class="w"> </span><span class="m">3000</span>
|
|
Average<span class="w"> </span>Input<span class="w"> </span>Length<span class="w"> </span><span class="o">(</span>tokens<span class="o">)</span>:<span class="w"> </span><span class="m">128</span>.0
|
|
Average<span class="w"> </span>Output<span class="w"> </span>Length<span class="w"> </span><span class="o">(</span>tokens<span class="o">)</span>:<span class="w"> </span><span class="m">128</span>.0
|
|
Token<span class="w"> </span>Throughput<span class="w"> </span><span class="o">(</span>tokens/sec<span class="o">)</span>:<span class="w"> </span><span class="m">23405</span>.927228471104
|
|
Request<span class="w"> </span>Throughput<span class="w"> </span><span class="o">(</span>req/sec<span class="o">)</span>:<span class="w"> </span><span class="m">182</span>.8588064724305
|
|
Total<span class="w"> </span>Latency<span class="w"> </span><span class="o">(</span>seconds<span class="o">)</span>:<span class="w"> </span><span class="m">16</span>.406100739
|
|
<span class="o">===========================================================</span>
|
|
</pre></div>
|
|
</div>
|
|
<blockquote>
|
|
<div><p><strong>Warning:</strong> In some cases, the benchmarker may not print anything at all. This behavior usually
|
|
means that the benchmark has hit an out of memory issue. Try reducing the KV cache percentage
|
|
using the <code class="docutils literal notranslate"><span class="pre">--kv_cache_free_gpu_mem_fraction</span></code> option to lower the percentage of used memory.</p>
|
|
</div></blockquote>
|
|
</section>
|
|
</section>
|
|
<section id="online-serving-measurements">
|
|
<h2>Online Serving Measurements<a class="headerlink" href="#online-serving-measurements" title="Link to this heading"></a></h2>
|
|
<p>The <a class="reference external" href="https://github.com/triton-inference-server/tensorrtllm_backend">TensorRT-LLM backend</a> is used to measure the performance of TensorRT-LLM for online serving.</p>
|
|
<p>The below table shows the throughput and latency under a serving scenario.</p>
|
|
<p><strong>All data in the table below was generated using version 0.14.0, with 500 requests and BF16 precision.</strong></p>
|
|
<table class="docutils align-default">
|
|
<thead>
|
|
<tr class="row-odd"><th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
<th class="head"><p></p></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr class="row-even"><td><p><strong>Model</strong></p></td>
|
|
<td><p><strong>GPU</strong></p></td>
|
|
<td><p><strong>TP</strong></p></td>
|
|
<td><p><strong>Input Length</strong></p></td>
|
|
<td><p><strong>Output Length</strong></p></td>
|
|
<td><p><strong>QPS</strong></p></td>
|
|
<td><p><strong>Tput(req/s)</strong></p></td>
|
|
<td><p><strong>Mean TTFT(ms)</strong></p></td>
|
|
<td><p><strong>Mean ITL(ms)</strong></p></td>
|
|
<td><p><strong>Total Token Tput (tok/s)</strong></p></td>
|
|
<td><p><strong>Output Tput (tok/s)</strong></p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA 3.1 70B</p></td>
|
|
<td><p>H100 80GB HBM3</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>256</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>62</p></td>
|
|
<td><p>21</p></td>
|
|
<td><p>1406</p></td>
|
|
<td><p>498</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>68</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>2750</p></td>
|
|
<td><p>973</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>92</p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>5256</p></td>
|
|
<td><p>1860</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>12</p></td>
|
|
<td><p>175</p></td>
|
|
<td><p>66</p></td>
|
|
<td><p>8941</p></td>
|
|
<td><p>3164</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>1229</p></td>
|
|
<td><p>86</p></td>
|
|
<td><p>11537</p></td>
|
|
<td><p>4083</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>9123</p></td>
|
|
<td><p>85</p></td>
|
|
<td><p>11593</p></td>
|
|
<td><p>4103</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>53</p></td>
|
|
<td><p>18</p></td>
|
|
<td><p>844</p></td>
|
|
<td><p>28</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>58</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>1908</p></td>
|
|
<td><p>63</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>71</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>3795</p></td>
|
|
<td><p>126</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>109</p></td>
|
|
<td><p>38</p></td>
|
|
<td><p>7492</p></td>
|
|
<td><p>248</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>28</p></td>
|
|
<td><p>1197</p></td>
|
|
<td><p>482</p></td>
|
|
<td><p>13655</p></td>
|
|
<td><p>452</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>28</p></td>
|
|
<td><p>9126</p></td>
|
|
<td><p>548</p></td>
|
|
<td><p>13719</p></td>
|
|
<td><p>454</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>202</p></td>
|
|
<td><p>214</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>48</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>780</p></td>
|
|
<td><p>401</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>51</p></td>
|
|
<td><p>22</p></td>
|
|
<td><p>1499</p></td>
|
|
<td><p>771</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>57</p></td>
|
|
<td><p>25</p></td>
|
|
<td><p>2702</p></td>
|
|
<td><p>1390</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>11</p></td>
|
|
<td><p>74</p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>4364</p></td>
|
|
<td><p>2245</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>14</p></td>
|
|
<td><p>116</p></td>
|
|
<td><p>42</p></td>
|
|
<td><p>5837</p></td>
|
|
<td><p>3003</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>4482</p></td>
|
|
<td><p>50</p></td>
|
|
<td><p>6725</p></td>
|
|
<td><p>3459</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA 3.1 8B</p></td>
|
|
<td><p></p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>256</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>23</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>1423</p></td>
|
|
<td><p>504</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>2624</p></td>
|
|
<td><p>929</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>26</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>5535</p></td>
|
|
<td><p>1959</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>15</p></td>
|
|
<td><p>30</p></td>
|
|
<td><p>11</p></td>
|
|
<td><p>10636</p></td>
|
|
<td><p>3765</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>26</p></td>
|
|
<td><p>50</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>19138</p></td>
|
|
<td><p>6774</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>37</p></td>
|
|
<td><p>3335</p></td>
|
|
<td><p>39</p></td>
|
|
<td><p>26614</p></td>
|
|
<td><p>9420</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>956</p></td>
|
|
<td><p>32</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>1910</p></td>
|
|
<td><p>63</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>22</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>3808</p></td>
|
|
<td><p>126</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7567</p></td>
|
|
<td><p>251</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>31</p></td>
|
|
<td><p>29</p></td>
|
|
<td><p>10</p></td>
|
|
<td><p>14894</p></td>
|
|
<td><p>493</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>79</p></td>
|
|
<td><p>3280</p></td>
|
|
<td><p>193</p></td>
|
|
<td><p>38319</p></td>
|
|
<td><p>1269</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>202</p></td>
|
|
<td><p>214</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>809</p></td>
|
|
<td><p>416</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>1586</p></td>
|
|
<td><p>816</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>21</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>3047</p></td>
|
|
<td><p>1568</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>13</p></td>
|
|
<td><p>23</p></td>
|
|
<td><p>10</p></td>
|
|
<td><p>5597</p></td>
|
|
<td><p>2879</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>23</p></td>
|
|
<td><p>27</p></td>
|
|
<td><p>11</p></td>
|
|
<td><p>9381</p></td>
|
|
<td><p>4825</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>39</p></td>
|
|
<td><p>1657</p></td>
|
|
<td><p>21</p></td>
|
|
<td><p>16117</p></td>
|
|
<td><p>8291</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA 3.1 70B</p></td>
|
|
<td><p>H200 131GB HBM3</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>256</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>58</p></td>
|
|
<td><p>18</p></td>
|
|
<td><p>1411</p></td>
|
|
<td><p>499</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>63</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>2770</p></td>
|
|
<td><p>980</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>84</p></td>
|
|
<td><p>27</p></td>
|
|
<td><p>5328</p></td>
|
|
<td><p>1886</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>13</p></td>
|
|
<td><p>165</p></td>
|
|
<td><p>60</p></td>
|
|
<td><p>9224</p></td>
|
|
<td><p>3264</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>1279</p></td>
|
|
<td><p>83</p></td>
|
|
<td><p>11800</p></td>
|
|
<td><p>4176</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>9222</p></td>
|
|
<td><p>83</p></td>
|
|
<td><p>11826</p></td>
|
|
<td><p>4185</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>50</p></td>
|
|
<td><p>15</p></td>
|
|
<td><p>956</p></td>
|
|
<td><p>32</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>55</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>1909</p></td>
|
|
<td><p>63</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>67</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>3799</p></td>
|
|
<td><p>126</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>103</p></td>
|
|
<td><p>33</p></td>
|
|
<td><p>7499</p></td>
|
|
<td><p>248</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>28</p></td>
|
|
<td><p>1259</p></td>
|
|
<td><p>485</p></td>
|
|
<td><p>13586</p></td>
|
|
<td><p>450</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>29</p></td>
|
|
<td><p>9074</p></td>
|
|
<td><p>546</p></td>
|
|
<td><p>13792</p></td>
|
|
<td><p>457</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>202</p></td>
|
|
<td><p>214</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>43</p></td>
|
|
<td><p>17</p></td>
|
|
<td><p>793</p></td>
|
|
<td><p>408</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>46</p></td>
|
|
<td><p>18</p></td>
|
|
<td><p>1524</p></td>
|
|
<td><p>784</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>51</p></td>
|
|
<td><p>21</p></td>
|
|
<td><p>2796</p></td>
|
|
<td><p>1438</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>11</p></td>
|
|
<td><p>67</p></td>
|
|
<td><p>28</p></td>
|
|
<td><p>4639</p></td>
|
|
<td><p>2386</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>15</p></td>
|
|
<td><p>112</p></td>
|
|
<td><p>39</p></td>
|
|
<td><p>6288</p></td>
|
|
<td><p>3235</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>17</p></td>
|
|
<td><p>4480</p></td>
|
|
<td><p>48</p></td>
|
|
<td><p>7230</p></td>
|
|
<td><p>3719</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p>LLaMA 3.1 8B</p></td>
|
|
<td><p>H200 131GB HBM3</p></td>
|
|
<td><p>1</p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>256</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>21</p></td>
|
|
<td><p>6</p></td>
|
|
<td><p>1425</p></td>
|
|
<td><p>504</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>23</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>2828</p></td>
|
|
<td><p>1001</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>5567</p></td>
|
|
<td><p>1971</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>15</p></td>
|
|
<td><p>27</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>10761</p></td>
|
|
<td><p>3809</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>27</p></td>
|
|
<td><p>44</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>19848</p></td>
|
|
<td><p>7025</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>40</p></td>
|
|
<td><p>3237</p></td>
|
|
<td><p>36</p></td>
|
|
<td><p>28596</p></td>
|
|
<td><p>10121</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>467</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>18</p></td>
|
|
<td><p>5</p></td>
|
|
<td><p>956</p></td>
|
|
<td><p>32</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>6</p></td>
|
|
<td><p>1910</p></td>
|
|
<td><p>63</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>6</p></td>
|
|
<td><p>3810</p></td>
|
|
<td><p>126</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>22</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>7567</p></td>
|
|
<td><p>250</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>31</p></td>
|
|
<td><p>27</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>14927</p></td>
|
|
<td><p>494</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>81</p></td>
|
|
<td><p>3227</p></td>
|
|
<td><p>190</p></td>
|
|
<td><p>39007</p></td>
|
|
<td><p>1291</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>202</p></td>
|
|
<td><p>214</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>2</p></td>
|
|
<td><p>17</p></td>
|
|
<td><p>6</p></td>
|
|
<td><p>812</p></td>
|
|
<td><p>418</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>4</p></td>
|
|
<td><p>18</p></td>
|
|
<td><p>6</p></td>
|
|
<td><p>1597</p></td>
|
|
<td><p>822</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>7</p></td>
|
|
<td><p>3088</p></td>
|
|
<td><p>1589</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>16</p></td>
|
|
<td><p>14</p></td>
|
|
<td><p>20</p></td>
|
|
<td><p>8</p></td>
|
|
<td><p>5771</p></td>
|
|
<td><p>2969</p></td>
|
|
</tr>
|
|
<tr class="row-odd"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>32</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>24</p></td>
|
|
<td><p>9</p></td>
|
|
<td><p>9931</p></td>
|
|
<td><p>5109</p></td>
|
|
</tr>
|
|
<tr class="row-even"><td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p></p></td>
|
|
<td><p>INF</p></td>
|
|
<td><p>43</p></td>
|
|
<td><p>1665</p></td>
|
|
<td><p>19</p></td>
|
|
<td><p>17861</p></td>
|
|
<td><p>9189</p></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<p><em>TP stands for Tensor Parallelism</em></p>
|
|
<p><em>TTFT stands for Time To First Token</em></p>
|
|
<p><em>ITL stands for Inter Token Latency</em></p>
|
|
<section id="id2">
|
|
<h3>For GH200 systems only<a class="headerlink" href="#id2" title="Link to this heading"></a></h3>
|
|
<p>For release v0.15, on GH200 systems, the recommendation is to use <em>gptManagerBenchmark</em> to measure performance. Throughput measurements are reported based on the command below.</p>
|
|
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span><span class="w"> </span>/app/tensorrt_llm/benchmarks/cpp/gptManagerBenchmark<span class="w"> </span>--engine_dir<span class="w"> </span><span class="nv">$engine_dir</span><span class="w"> </span>--type<span class="w"> </span>IFB<span class="w"> </span>--dataset<span class="w"> </span><span class="nv">$dataset_file_json</span><span class="w"> </span>--eos_id<span class="w"> </span>-1<span class="w"> </span>--scheduler_policy<span class="w"> </span>guaranteed_no_evict<span class="w"> </span>--kv_cache_free_gpu_mem_fraction<span class="w"> </span><span class="m">0</span>.95<span class="w"> </span>--output_csv<span class="w"> </span>result.csv<span class="w"> </span>--request_rate<span class="w"> </span>-1.0<span class="w"> </span>--enable_chunked_context<span class="w"> </span>--warm_up<span class="w"> </span><span class="m">0</span>
|
|
</pre></div>
|
|
</div>
|
|
<blockquote>
|
|
<div><p><strong>Warning:</strong> CUDA error: out of memory <br />
|
|
If a benchmark with a large model fails with an out-of-memory (OOM) error, modify the command above to use <code class="docutils literal notranslate"><span class="pre">--kv_cache_free_gpu_mem_fraction</span> <span class="pre">0.90</span></code> to avoid the error.</p>
|
|
</div></blockquote>
|
|
<p>The command will run the <code class="docutils literal notranslate"><span class="pre">gptManagerBenchmark</span></code> binary that will report the throughput and other metrics as part of its output
|
|
that can be compared with the table in the <a class="reference internal" href="#throughput-measurements">Throughput Measurements</a> section of this page.</p>
|
|
</section>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
|
<a href="../advanced/disaggregated-service.html" class="btn btn-neutral float-left" title="Disaggregated-Service (experimental)" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
|
<a href="perf-benchmarking.html" class="btn btn-neutral float-right" title="TensorRT-LLM Benchmarking" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
|
</div>
|
|
|
|
<hr/>
|
|
|
|
<div role="contentinfo">
|
|
<!-- Unrendered Jinja2 block reference removed (the template emitted the object repr instead of block content). -->
|
|
|
|
<div class="footer">
|
|
<p>
|
|
Copyright © 2024 NVIDIA Corporation
|
|
</p>
|
|
<p>
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Privacy Policy</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Manage My Privacy</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
|
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Accessibility</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
|
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Product Security</a> |
|
|
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
|
data-cms-ai="0">Contact</a>
|
|
</p>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
|
|
|
|
|
|
</footer>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
</div>
|
|
<script>
|
|
jQuery(function () {
|
|
SphinxRtdTheme.Navigation.enable(true);
|
|
});
|
|
</script>
|
|
|
|
</body>
|
|
</html> |