<!--
Scrape/export metadata (not part of the original document):
TensorRT-LLMs/commands/trtllm-build.html
2024-12-04 14:25:18 +08:00
579 lines
52 KiB
HTML
-->
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<!-- Sphinx-generated document head. Fix: the generator emitted two
     <meta name="viewport"> tags (initial-scale=1 and initial-scale=1.0);
     the redundant duplicate is removed, keeping a single viewport tag. -->
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>trtllm-build &mdash; tensorrt_llm documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<!-- Theme/runtime scripts; load order matters (jQuery before the compat shim
     and doctools; clipboard.min.js before copybutton.js). -->
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=65e89d2a"></script>
<script src="../_static/js/theme.js"></script>
<!-- Sequential-navigation link relations used by the RTD theme. -->
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="trtllm-serve" href="trtllm-serve.html" />
<link rel="prev" title="Runtime" href="../_cpp_gen/runtime.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<!-- Read the Docs theme sidebar (fixed navigation column). Markup is
     Sphinx-generated and coupled to theme CSS/JS; only comments added. -->
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<!-- Project home link + search form at the top of the sidebar. -->
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
tensorrt_llm
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<!-- Global table of contents: one caption heading + <ul> per toctree group. -->
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/customization.html">Common Customizations</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/llm_api_examples.html">Examples</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<!-- "current" classes mark the page being viewed; the nested <ul> is this
     page's local table of contents (anchors within trtllm-build.html). -->
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">trtllm-build</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-named-arguments">Named Arguments</a></li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-logits-arguments">Logits arguments</a></li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-lora-arguments">LoRA arguments</a></li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-speculative-decoding-arguments">Speculative decoding arguments</a></li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-auto-parallel-arguments">Auto parallel arguments</a></li>
<li class="toctree-l2"><a class="reference internal" href="#tensorrt_llm.commands.build-parse_arguments-plugin-config-arguments">Plugin config arguments</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="trtllm-serve.html">trtllm-serve</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-best-practices.html">Best Practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">tensorrt_llm</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">trtllm-build</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/commands/trtllm-build.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="trtllm-build">
<h1>trtllm-build<a class="headerlink" href="#trtllm-build" title="Link to this heading"></a></h1>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">usage</span><span class="p">:</span> <span class="n">trtllm</span><span class="o">-</span><span class="n">build</span> <span class="p">[</span><span class="o">-</span><span class="n">h</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">checkpoint_dir</span> <span class="n">CHECKPOINT_DIR</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">model_config</span> <span class="n">MODEL_CONFIG</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">build_config</span> <span class="n">BUILD_CONFIG</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">model_cls_file</span> <span class="n">MODEL_CLS_FILE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_name</span> <span class="n">MODEL_CLS_NAME</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">output_dir</span> <span class="n">OUTPUT_DIR</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">max_batch_size</span> <span class="n">MAX_BATCH_SIZE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_input_len</span> <span class="n">MAX_INPUT_LEN</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">max_seq_len</span> <span class="n">MAX_SEQ_LEN</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">max_beam_width</span> <span class="n">MAX_BEAM_WIDTH</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_num_tokens</span> <span class="n">MAX_NUM_TOKENS</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">opt_num_tokens</span> <span class="n">OPT_NUM_TOKENS</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_encoder_input_len</span> <span class="n">MAX_ENCODER_INPUT_LEN</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_prompt_embedding_table_size</span> <span class="n">MAX_PROMPT_EMBEDDING_TABLE_SIZE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">kv_cache_type</span> <span class="n">KV_CACHE_TYPE</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">paged_kv_cache</span> <span class="n">PAGED_KV_CACHE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">input_timing_cache</span> <span class="n">INPUT_TIMING_CACHE</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">output_timing_cache</span> <span class="n">OUTPUT_TIMING_CACHE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">profiling_verbosity</span> <span class="p">{</span><span class="n">layer_names_only</span><span class="p">,</span><span class="n">detailed</span><span class="p">,</span><span class="n">none</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">strip_plan</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">weight_sparsity</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">weight_streaming</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">fast_build</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">workers</span> <span class="n">WORKERS</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">log_level</span> <span class="p">{</span><span class="n">internal_error</span><span class="p">,</span><span class="n">error</span><span class="p">,</span><span class="n">warning</span><span class="p">,</span><span class="n">info</span><span class="p">,</span><span class="n">verbose</span><span class="p">,</span><span class="n">debug</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">enable_debug_output</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">visualize_network</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">dry_run</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">monitor_memory</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">logits_dtype</span> <span class="p">{</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">gather_context_logits</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">gather_generation_logits</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">gather_all_token_logits</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">lora_dir</span> <span class="n">LORA_DIR</span> <span class="p">[</span><span class="n">LORA_DIR</span> <span class="o">...</span><span class="p">]]</span> <span class="p">[</span><span class="o">--</span><span class="n">lora_ckpt_source</span> <span class="p">{</span><span class="n">hf</span><span class="p">,</span><span class="n">nemo</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">lora_target_modules</span> <span class="p">{</span><span class="n">attn_qkv</span><span class="p">,</span><span class="n">attn_q</span><span class="p">,</span><span class="n">attn_k</span><span class="p">,</span><span class="n">attn_v</span><span class="p">,</span><span class="n">attn_dense</span><span class="p">,</span><span class="n">mlp_h_to_4h</span><span class="p">,</span><span class="n">mlp_4h_to_h</span><span class="p">,</span><span class="n">mlp_gate</span><span class="p">,</span><span class="n">cross_attn_qkv</span><span class="p">,</span><span class="n">cross_attn_q</span><span class="p">,</span><span class="n">cross_attn_k</span><span class="p">,</span><span class="n">cross_attn_v</span><span class="p">,</span><span class="n">cross_attn_dense</span><span class="p">,</span><span class="n">moe_h_to_4h</span><span class="p">,</span><span class="n">moe_4h_to_h</span><span class="p">,</span><span class="n">moe_gate</span><span class="p">,</span><span class="n">moe_router</span><span class="p">,</span><span class="n">mlp_router</span><span class="p">}</span> <span class="p">[{</span><span class="n">attn_qkv</span><span class="p">,</span><span class="n">attn_q</span><span class="p">,</span><span class="n">attn_k</span><span class="p">,</span><span class="n">attn_v</span><span class="p">,</span><span class="n">attn_dense</span><span class="p">,</span><span class="n">mlp_h_to_4h</span><span class="p">,</span><span class="n">mlp_4h_to_h</span><span class="p">,</span><span class="n">mlp_gate</span><span class="p">,</span><span class="n">cross_attn_qkv</span><span class="p">,</span><span class="n">cross_attn_q</span><span class="p">,</span><span class="n">cross_attn_k</span><span class="p">,</span><span class="n">cross_attn_v</span><span class="p">,</span><span class="n">cross_attn_dense</span><span class="p">,</span><span class="n">moe_h_to_4h</span><span class="p">,</span><span 
class="n">moe_4h_to_h</span><span class="p">,</span><span class="n">moe_gate</span><span class="p">,</span><span class="n">moe_router</span><span class="p">,</span><span class="n">mlp_router</span><span class="p">}</span> <span class="o">...</span><span class="p">]]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_lora_rank</span> <span class="n">MAX_LORA_RANK</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">speculative_decoding_mode</span> <span class="p">{</span><span class="n">draft_tokens_external</span><span class="p">,</span><span class="n">lookahead_decoding</span><span class="p">,</span><span class="n">medusa</span><span class="p">,</span><span class="n">explicit_draft_tokens</span><span class="p">,</span><span class="n">eagle</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_draft_len</span> <span class="n">MAX_DRAFT_LEN</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">auto_parallel</span> <span class="n">AUTO_PARALLEL</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">gpus_per_node</span> <span class="n">GPUS_PER_NODE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">cluster_key</span> <span class="p">{</span><span class="n">A100</span><span class="o">-</span><span class="n">SXM</span><span class="o">-</span><span class="mi">80</span><span class="n">GB</span><span class="p">,</span><span class="n">A100</span><span class="o">-</span><span class="n">SXM</span><span class="o">-</span><span class="mi">40</span><span class="n">GB</span><span class="p">,</span><span class="n">A100</span><span class="o">-</span><span class="n">PCIe</span><span class="o">-</span><span class="mi">80</span><span class="n">GB</span><span class="p">,</span><span class="n">A100</span><span class="o">-</span><span class="n">PCIe</span><span class="o">-</span><span class="mi">40</span><span class="n">GB</span><span class="p">,</span><span class="n">H100</span><span class="o">-</span><span class="n">SXM</span><span class="p">,</span><span class="n">H100</span><span class="o">-</span><span class="n">PCIe</span><span class="p">,</span><span class="n">H20</span><span class="p">,</span><span class="n">V100</span><span class="o">-</span><span class="n">PCIe</span><span class="o">-</span><span class="mi">16</span><span class="n">GB</span><span class="p">,</span><span class="n">V100</span><span class="o">-</span><span class="n">PCIe</span><span class="o">-</span><span class="mi">32</span><span class="n">GB</span><span class="p">,</span><span class="n">V100</span><span class="o">-</span><span class="n">SXM</span><span class="o">-</span><span class="mi">16</span><span class="n">GB</span><span class="p">,</span><span class="n">V100</span><span class="o">-</span><span class="n">SXM</span><span class="o">-</span><span class="mi">32</span><span class="n">GB</span><span class="p">,</span><span class="n">V100S</span><span class="o">-</span><span class="n">PCIe</span><span class="p">,</span><span class="n">A40</span><span class="p">,</span><span class="n">A30</span><span class="p">,</span><span 
class="n">A10</span><span class="p">,</span><span class="n">A10G</span><span class="p">,</span><span class="n">L40S</span><span class="p">,</span><span class="n">L40</span><span class="p">,</span><span class="n">L20</span><span class="p">,</span><span class="n">L4</span><span class="p">,</span><span class="n">L2</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">bert_attention_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">gpt_attention_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">gemm_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">fp8</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">gemm_swiglu_plugin</span> <span class="p">{</span><span class="n">fp8</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">fp8_rowwise_gemm_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">nccl_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">lora_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">moe_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">mamba_conv1d_plugin</span> <span class="p">{</span><span class="n">auto</span><span class="p">,</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">,</span><span class="n">bfloat16</span><span class="p">,</span><span class="n">int32</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">low_latency_gemm_plugin</span> <span class="p">{</span><span class="n">fp8</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">low_latency_gemm_swiglu_plugin</span> <span class="p">{</span><span class="n">fp8</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">context_fmha</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">bert_context_fmha_fp32_acc</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">remove_input_padding</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">reduce_fusion</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">enable_xqa</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">tokens_per_block</span> <span class="n">TOKENS_PER_BLOCK</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">use_paged_context_fmha</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">use_fp8_context_fmha</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">multiple_profiles</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">paged_state</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">streamingllm</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span> <span class="p">[</span><span class="o">--</span><span class="n">use_fused_mlp</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
<span class="p">[</span><span class="o">--</span><span class="n">pp_reduce_scatter</span> <span class="p">{</span><span class="n">enable</span><span class="p">,</span><span class="n">disable</span><span class="p">}]</span>
</pre></div>
</div>
<section id="tensorrt_llm.commands.build-parse_arguments-named-arguments">
<h2>Named Arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-named-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--checkpoint_dir</kbd></dt>
<dd><p>The directory path that contains TensorRT-LLM checkpoint.</p>
</dd>
<dt><kbd>--model_config</kbd></dt>
<dd><p>The file path that saves TensorRT-LLM checkpoint config.</p>
</dd>
<dt><kbd>--build_config</kbd></dt>
<dd><p>The file path that saves TensorRT-LLM build config.</p>
</dd>
<dt><kbd>--model_cls_file</kbd></dt>
<dd><p>The file path that defines customized TensorRT-LLM model.</p>
</dd>
<dt><kbd>--model_cls_name</kbd></dt>
<dd><p>The customized TensorRT-LLM model class name.</p>
</dd>
<dt><kbd>--output_dir</kbd></dt>
<dd><p>The directory path to save the serialized engine files and engine config file.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'engine_outputs'</span></code></p>
</dd>
<dt><kbd>--max_batch_size</kbd></dt>
<dd><p>Maximum number of requests that the engine can schedule.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">2048</span></code></p>
</dd>
<dt><kbd>--max_input_len</kbd></dt>
<dd><p>Maximum input length of one request.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">1024</span></code></p>
</dd>
<dt><kbd>--max_seq_len, --max_decoder_seq_len</kbd></dt>
<dd><p>Maximum total length of one request, including prompt and outputs. If unspecified, the value is deduced from the model config.</p>
</dd>
<dt><kbd>--max_beam_width</kbd></dt>
<dd><p>Maximum number of beams for beam search decoding.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">1</span></code></p>
</dd>
<dt><kbd>--max_num_tokens</kbd></dt>
<dd><p>Maximum number of batched input tokens after padding is removed in each batch. Currently, the input padding is removed by default; you may explicitly disable it by specifying <code class="docutils literal notranslate"><span class="pre">--remove_input_padding</span> <span class="pre">disable</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">8192</span></code></p>
</dd>
<dt><kbd>--opt_num_tokens</kbd></dt>
<dd><p>Optimal number of batched input tokens after padding is removed in each batch. It equals <code class="docutils literal notranslate"><span class="pre">max_batch_size</span> <span class="pre">*</span> <span class="pre">max_beam_width</span></code> by default; set this value as close as possible to the actual number of tokens in your workload. Note that this argument might be removed in the future.</p>
</dd>
<dt><kbd>--max_encoder_input_len</kbd></dt>
<dd><p>Maximum encoder input length for enc-dec models. Set <code class="docutils literal notranslate"><span class="pre">max_input_len</span></code> to 1 to start generation from decoder_start_token_id of length 1.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">1024</span></code></p>
</dd>
<dt><kbd>--max_prompt_embedding_table_size, --max_multimodal_len</kbd></dt>
<dd><p>Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. Setting a value &gt; 0 enables prompt tuning or multimodal input.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">0</span></code></p>
</dd>
<dt><kbd>--kv_cache_type</kbd></dt>
<dd><p>Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed.</p>
</dd>
<dt><kbd>--paged_kv_cache</kbd></dt>
<dd><p>Deprecated. Enabling this option is equivalent to <code class="docutils literal notranslate"><span class="pre">--kv_cache_type</span> <span class="pre">paged</span></code> for transformer-based models.</p>
</dd>
<dt><kbd>--input_timing_cache</kbd></dt>
<dd><p>The file path to read the timing cache. This option is ignored if the file does not exist.</p>
</dd>
<dt><kbd>--output_timing_cache</kbd></dt>
<dd><p>The file path to write the timing cache.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'model.cache'</span></code></p>
</dd>
<dt><kbd>--profiling_verbosity</kbd></dt>
<dd><p>Possible choices: layer_names_only, detailed, none</p>
<p>The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'layer_names_only'</span></code></p>
</dd>
<dt><kbd>--strip_plan</kbd></dt>
<dd><p>Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--weight_sparsity</kbd></dt>
<dd><p>Enable weight sparsity.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--weight_streaming</kbd></dt>
<dd><p>Enable offloading weights to CPU and streaming loading at runtime.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--fast_build</kbd></dt>
<dd><p>Enable features for faster engine building. This may cause some performance degradation and is currently incompatible with int8/int4 quantization without plugin.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--workers</kbd></dt>
<dd><p>The number of workers for building in parallel.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">1</span></code></p>
</dd>
<dt><kbd>--log_level</kbd></dt>
<dd><p>Possible choices: internal_error, error, warning, info, verbose, debug</p>
<p>The logging level.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'info'</span></code></p>
</dd>
<dt><kbd>--enable_debug_output</kbd></dt>
<dd><p>Enable debug output.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--visualize_network</kbd></dt>
<dd><p>Export TensorRT Networks to ONNX prior to Engine build for debugging.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--dry_run</kbd></dt>
<dd><p>Run through the build process except the actual Engine build for debugging.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--monitor_memory</kbd></dt>
<dd><p>Enable memory monitor during Engine build.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
</dl>
</section>
<section id="tensorrt_llm.commands.build-parse_arguments-logits-arguments">
<h2>Logits arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-logits-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--logits_dtype</kbd></dt>
<dd><p>Possible choices: float16, float32</p>
<p>The data type of logits.</p>
</dd>
<dt><kbd>--gather_context_logits</kbd></dt>
<dd><p>Enable gathering context logits.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--gather_generation_logits</kbd></dt>
<dd><p>Enable gathering generation logits.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
<dt><kbd>--gather_all_token_logits</kbd></dt>
<dd><p>Enable both <code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code> and <code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">False</span></code></p>
</dd>
</dl>
</section>
<section id="tensorrt_llm.commands.build-parse_arguments-lora-arguments">
<h2>LoRA arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-lora-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--lora_dir</kbd></dt>
<dd><p>The directory of LoRA weights. If multiple directories are provided, the first one is used for configuration.</p>
</dd>
<dt><kbd>--lora_ckpt_source</kbd></dt>
<dd><p>Possible choices: hf, nemo</p>
<p>The source type of LoRA checkpoint.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'hf'</span></code></p>
</dd>
<dt><kbd>--lora_target_modules</kbd></dt>
<dd><p>Possible choices: attn_qkv, attn_q, attn_k, attn_v, attn_dense, mlp_h_to_4h, mlp_4h_to_h, mlp_gate, cross_attn_qkv, cross_attn_q, cross_attn_k, cross_attn_v, cross_attn_dense, moe_h_to_4h, moe_4h_to_h, moe_gate, moe_router, mlp_router</p>
<p>The names of the target modules that LoRA is applied to. Only effective when <code class="docutils literal notranslate"><span class="pre">lora_plugin</span></code> is enabled.</p>
</dd>
<dt><kbd>--max_lora_rank</kbd></dt>
<dd><p>Maximum LoRA rank for different LoRA modules. It is used to compute the workspace size of LoRA plugin.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">64</span></code></p>
</dd>
</dl>
</section>
<section id="tensorrt_llm.commands.build-parse_arguments-speculative-decoding-arguments">
<h2>Speculative decoding arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-speculative-decoding-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--speculative_decoding_mode</kbd></dt>
<dd><p>Possible choices: draft_tokens_external, lookahead_decoding, medusa, explicit_draft_tokens, eagle</p>
<p>Mode of speculative decoding.</p>
</dd>
<dt><kbd>--max_draft_len</kbd></dt>
<dd><p>Maximum length of draft tokens for the speculative decoding target model.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">0</span></code></p>
</dd>
</dl>
</section>
<section id="tensorrt_llm.commands.build-parse_arguments-auto-parallel-arguments">
<h2>Auto parallel arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-auto-parallel-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--auto_parallel</kbd></dt>
<dd><p>MPI world size for auto parallel.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">1</span></code></p>
</dd>
<dt><kbd>--gpus_per_node</kbd></dt>
<dd><p>Number of GPUs each node has in a multi-node setup. This is a cluster spec and can be greater/smaller than world size. This option is only used for auto parallel specified with <code class="docutils literal notranslate"><span class="pre">--auto_parallel</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">8</span></code></p>
</dd>
<dt><kbd>--cluster_key</kbd></dt>
<dd><p>Possible choices: A100-SXM-80GB, A100-SXM-40GB, A100-PCIe-80GB, A100-PCIe-40GB, H100-SXM, H100-PCIe, H20, V100-PCIe-16GB, V100-PCIe-32GB, V100-SXM-16GB, V100-SXM-32GB, V100S-PCIe, A40, A30, A10, A10G, L40S, L40, L20, L4, L2</p>
<p>Unique name for target GPU type. Inferred from current GPU type if not specified. This option is only used for auto parallel specified with <code class="docutils literal notranslate"><span class="pre">--auto_parallel</span></code>.</p>
</dd>
</dl>
</section>
<section id="tensorrt_llm.commands.build-parse_arguments-plugin-config-arguments">
<h2>Plugin config arguments<a class="headerlink" href="#tensorrt_llm.commands.build-parse_arguments-plugin-config-arguments" title="Link to this heading"></a></h2>
<dl class="option-list">
<dt><kbd>--bert_attention_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">bert_attention_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'auto'</span></code></p>
</dd>
<dt><kbd>--gpt_attention_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">gpt_attention_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'auto'</span></code></p>
</dd>
<dt><kbd>--gemm_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, fp8, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">gemm_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--gemm_swiglu_plugin</kbd></dt>
<dd><p>Possible choices: fp8, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">gemm_swiglu_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--fp8_rowwise_gemm_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">fp8_rowwise_gemm_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--nccl_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">nccl_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'auto'</span></code></p>
</dd>
<dt><kbd>--lora_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">lora_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--moe_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">moe_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'auto'</span></code></p>
</dd>
<dt><kbd>--mamba_conv1d_plugin</kbd></dt>
<dd><p>Possible choices: auto, float16, float32, bfloat16, int32, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">mamba_conv1d_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'auto'</span></code></p>
</dd>
<dt><kbd>--low_latency_gemm_plugin</kbd></dt>
<dd><p>Possible choices: fp8, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">low_latency_gemm_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--low_latency_gemm_swiglu_plugin</kbd></dt>
<dd><p>Possible choices: fp8, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">low_latency_gemm_swiglu_plugin</span></code> and the dtype.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--context_fmha</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">context_fmha</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'enable'</span></code></p>
</dd>
<dt><kbd>--bert_context_fmha_fp32_acc</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">bert_context_fmha_fp32_acc</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--remove_input_padding</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">remove_input_padding</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'enable'</span></code></p>
</dd>
<dt><kbd>--reduce_fusion</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">reduce_fusion</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--enable_xqa</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">enable_xqa</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'enable'</span></code></p>
</dd>
<dt><kbd>--tokens_per_block</kbd></dt>
<dd><p><code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">64</span></code></p>
</dd>
<dt><kbd>--use_paged_context_fmha</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">use_paged_context_fmha</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--use_fp8_context_fmha</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">use_fp8_context_fmha</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--multiple_profiles</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">multiple_profiles</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--paged_state</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">paged_state</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'enable'</span></code></p>
</dd>
<dt><kbd>--streamingllm</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">streamingllm</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
<dt><kbd>--use_fused_mlp</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">use_fused_mlp</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'enable'</span></code></p>
</dd>
<dt><kbd>--pp_reduce_scatter</kbd></dt>
<dd><p>Possible choices: enable, disable</p>
<p>Whether to enable/disable <code class="docutils literal notranslate"><span class="pre">pp_reduce_scatter</span></code>.</p>
<p>Default: <code class="docutils literal notranslate"><span class="pre">'disable'</span></code></p>
</dd>
</dl>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="../_cpp_gen/runtime.html" class="btn btn-neutral float-left" title="Runtime" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="trtllm-serve.html" class="btn btn-neutral float-right" title="trtllm-serve" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<!-- NOTE: a Jinja2 BlockReference failed to render here during site generation; the raw Python object repr has been removed. -->
<div class="footer">
<p>
Copyright © 2024 NVIDIA Corporation
</p>
<p>
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
data-cms-ai="0">Privacy Policy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
data-cms-ai="0">Manage My Privacy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
data-cms-ai="0">Do Not Sell or Share My Data</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
rel="noopener" data-cms-ai="0">Terms of Service</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
data-cms-ai="0">Accessibility</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
data-cms-ai="0">Product Security</a> |
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
data-cms-ai="0">Contact</a>
</p>
</div>
</div>
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>