Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-26 05:32:57 +08:00
Update latest GitHub pages to v0.20.0rc3
This commit is contained in:
parent 90385c8e86
commit 82845d2c23
latest/.buildinfo (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 12c1352bd1428d2c6ac709024163b9d8
+tags: 645f666f9bcd5a90fca523b33c5a78b7
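.buildinfo is how Sphinx detects stale output: when the stored config hash no longer matches (or the file is missing), the whole tree is rebuilt. A minimal sketch of that check, assuming only the file format shown above; this is illustrative, not Sphinx's actual implementation:

```python
# Hypothetical sketch: decide whether a Sphinx output tree needs a full
# rebuild by comparing the config hash stored in .buildinfo. Mirrors the
# comment inside the file itself; not TensorRT-LLM or Sphinx source.
from pathlib import Path

def needs_full_rebuild(outdir: str, current_config_hash: str) -> bool:
    buildinfo = Path(outdir) / ".buildinfo"
    if not buildinfo.exists():
        return True  # per the file's own comment: missing file => full rebuild
    for line in buildinfo.read_text().splitlines():
        if line.startswith("config:"):
            return line.split(":", 1)[1].strip() != current_config_hash
    return True  # malformed file: rebuild to be safe

# Example: needs_full_rebuild("latest", "12c1352bd1428d2c6ac709024163b9d8")
```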
latest/.nojekyll (new file, 0 lines)
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py (new file, 1449 lines)
File diff suppressed because it is too large
@@ -50,7 +50,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
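These DOCUMENTATION_OPTIONS values are emitted by pydata-sphinx-theme's version switcher. A minimal conf.py sketch that would produce them, assuming the project uses the theme's built-in switcher (the option names are the theme's documented ones; the concrete values are read off this diff, not taken from the TensorRT-LLM repo):

```python
# Hypothetical conf.py excerpt for a pydata-sphinx-theme version switcher.
# "switcher" and "show_version_warning_banner" are real theme options;
# the values below are inferred from the generated HTML in this commit.
release = "0.20.0rc3"

html_theme = "pydata_sphinx_theme"
html_theme_options = {
    "switcher": {
        # JSON list of {"name", "version", "url"} entries, served with the pages
        "json_url": "./_static/switcher.json",
        # must match one "version" entry in switcher.json
        "version_match": release,
    },
    "show_version_warning_banner": False,
}
```

Bumping `release` (and the matching switcher.json entry) is what the rc2 -> rc3 change in this hunk reflects.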
@@ -60,7 +60,7 @@
 
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc2" />
+<meta name="docsearch:version" content="0.20.0rc3" />
 
 
 </head>
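Since this same 0.20.0rc2 -> 0.20.0rc3 bump recurs in every regenerated page, a release sanity check could scan the built tree for stragglers. A small illustrative script, not from the repo; the latest/ output directory is assumed from the file paths above:

```python
# Illustrative check: verify every built HTML page under latest/ advertises
# the expected docsearch version after a docs bump. Hypothetical helper,
# not part of TensorRT-LLM.
import pathlib
import re
import sys

EXPECTED = "0.20.0rc3"
META = re.compile(r'<meta name="docsearch:version" content="([^"]+)"')

def stale_pages(root: str = "latest"):
    for page in pathlib.Path(root).rglob("*.html"):
        m = META.search(page.read_text(encoding="utf-8", errors="ignore"))
        if m and m.group(1) != EXPECTED:
            yield page, m.group(1)

if __name__ == "__main__":
    bad = list(stale_pages())
    for page, found in bad:
        print(f"{page}: found {found}, expected {EXPECTED}")
    sys.exit(1 if bad else 0)
```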
@@ -323,19 +323,19 @@
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
 <ul class="nav bd-sidenav">
 <li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
 </details></li>
 <li class="toctree-l1"><a class="reference internal" href="../examples/customization.html">LLM Common Customizations</a></li>
 <li class="toctree-l1 has-children"><a class="reference internal" href="../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client.html">Genai Perf Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
 </ul>
 </div>
 </nav></div>
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@@ -50,7 +50,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -60,7 +60,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc2" />
+<meta name="docsearch:version" content="0.20.0rc3" />
 </head>
@@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@@ -1805,7 +1807,7 @@
     p_dtype = default_net().plugin_config.gemm_swiglu_plugin
     if p_dtype == "fp8":
-        assert bias == None, "fp8 gemm_swiglu does not support bias yet"
+        assert bias is None, "fp8 gemm_swiglu does not support bias yet"

     pf_type = trt.PluginField(
         "type_id", np.array([int(str_dtype_to_trt(p_dtype))], np.int32),
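This hunk, like the two that follow, replaces a `== None` comparison with the identity check `is None`, the PEP 8 idiom. The distinction matters when a class overrides `__eq__`, as this small standalone illustration (not part of the diff) shows:

# Illustration only, not from this diff: why `is None` beats `== None`.
class AlwaysEqual:
    def __eq__(self, other):
        # An overridden __eq__ can make equality against None misleading.
        return True

x = AlwaysEqual()
print(x == None)   # True  -- __eq__ fires, result is misleading
print(x is None)   # False -- identity check is unambiguous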
@@ -3547,7 +3549,7 @@
     # Distribute embedding lookup table across multiple GPU
     if tp_size > 1 and tp_group is not None:
         if sharding_dim == 0:  # TP on vocab_size dimension
-            if tp_rank == None:
+            if tp_rank is None:
                 raise ValueError(
                     "Rank cannot be none for tensor parallelism on vocab dim")
@@ -5237,7 +5239,7 @@
     assert a.dtype == trt.fp8
     assert b.dtype == trt.fp8

-    if output_dtype == None:
+    if output_dtype is None:
         output_dtype = str_dtype_to_trt(
             default_net().plugin_config.gemm_allreduce_plugin)
     assert output_dtype in [trt.float16, trt.bfloat16]
@@ -5338,7 +5340,10 @@
                    sage_attn: bool = False,
                    sage_attn_q_block_size: int = 0,
                    sage_attn_k_block_size: int = 0,
-                   sage_attn_v_block_size: int = 0) -> Tuple[Tensor]:
+                   sage_attn_v_block_size: int = 0,
+                   cp_group: list[int] = None,
+                   cp_size: int = 1,
+                   cp_rank: int = 0) -> Tuple[Tensor]:
    '''
    Add an operation that performs the multi-head attention in BERT.
@@ -5405,6 +5410,15 @@
         sage_attn_v_quant_size: int = 0
             dynamic quant block size along sequence dimension of v tensor. Each quant block will share one scale.

+        cp_group: list[int] = None
+            The communication group for context parallel
+
+        cp_size: int = 1
+            The communication size for context parallel
+
+        cp_rank: int = 0
+            The communication rank for context parallel
+
     Returns:
         The tensor produced by that layer.
     '''
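The three new parameters identify this rank's context-parallel communication group. A hypothetical helper (names and rank layout are illustrative, not from TensorRT-LLM) sketching how a caller might derive them from a flat rank space:

# Hypothetical sketch, not part of the diff: derive context-parallel
# arguments of the kind bert_attention now accepts, assuming ranks are
# grouped contiguously into cp_size-wide groups.
def make_cp_args(global_rank: int, world_size: int, cp_size: int):
    assert world_size % cp_size == 0
    group_index = global_rank // cp_size
    cp_group = [group_index * cp_size + r for r in range(cp_size)]
    cp_rank = global_rank % cp_size
    return cp_group, cp_size, cp_rank

cp_group, cp_size, cp_rank = make_cp_args(global_rank=5, world_size=8, cp_size=4)
# -> cp_group=[4, 5, 6, 7], cp_size=4, cp_rank=1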
@@ -5459,10 +5473,31 @@
         np.array(sage_attn_v_block_size, dtype=np.int32),
         trt.PluginFieldType.INT32)

+    if cp_size > 1:
+        # transpose q,k,v inside qkv to make kv contiguous, which is required by ring attention
+        # (b, s, 3d)
+        query, key, value = chunk(tensor, 3, dim=-1)
+        bs = shape(query, 0)
+        seq_len = shape(query, 1)
+        # (b, s, d) -> (b, s, 2d) -> (2b, s, d)
+        kv = concat([key, value],
+                    dim=-1).view(concat((2 * bs, seq_len, query.shape[-1])))
+        tensor = concat((query, kv),
+                        dim=0).view(concat((bs, seq_len, query.shape[-1] * 3)))
+
+    cp_size = trt.PluginField("cp_size", np.array(cp_size, dtype=np.int32),
+                              trt.PluginFieldType.INT32)
+    cp_rank = trt.PluginField("cp_rank", np.array(cp_rank, dtype=np.int32),
+                              trt.PluginFieldType.INT32)
+    cp_group = cp_group or [0]
+    cp_group = np.array(cp_group, dtype=np.int32)
+    cp_group = trt.PluginField("cp_group", cp_group, trt.PluginFieldType.INT32)
+
     pfc = trt.PluginFieldCollection([
         nheads, head_size, q_scaling, context_fmha_type, pf_type,
         do_relative_attention, max_distance, remove_padding, sage_attn,
-        sage_attn_q_block_size, sage_attn_k_block_size, sage_attn_v_block_size
+        sage_attn_q_block_size, sage_attn_k_block_size, sage_attn_v_block_size,
+        cp_size, cp_rank, cp_group
     ])

     attn_plug = attn_plg_creator.create_plugin("padding_attn", pfc)
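The new `cp_size > 1` branch repacks the fused qkv tensor so that k and v become batch-contiguous before the plugin consumes them. A standalone NumPy sketch (illustrative only, not from the diff) that mirrors the same chunk/concat/reshape steps and checks the shapes:

import numpy as np

# Illustrative shape check of the qkv repacking performed above.
b, s, d = 2, 4, 8                    # batch, sequence length, hidden size
qkv = np.random.rand(b, s, 3 * d)    # fused (b, s, 3d) input

q, k, v = np.split(qkv, 3, axis=-1)  # each (b, s, d)
# (b, s, d) + (b, s, d) -> (b, s, 2d) -> (2b, s, d): k/v become contiguous
kv = np.concatenate([k, v], axis=-1).reshape(2 * b, s, d)
# stack q on top of kv along the batch axis, then view back as (b, s, 3d)
packed = np.concatenate([q, kv], axis=0).reshape(b, s, 3 * d)

assert packed.shape == (b, s, 3 * d)  # same buffer size, plugin-friendly layout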
@@ -5684,6 +5719,7 @@
                 beta_slow: int = 1,
                 mscale: float = 1.0,
                 mscale_all_dim: float = 1.0,
+                duplicate_data: bool = True,
                 dtype=np.float32):

     # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/main/modeling_deepseek.py
@@ -5741,23 +5777,25 @@
     inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high,
                                                 dim // 2).astype(dtype)
     inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
-    t = np.arange(num_pos, dtype=dtype)
-
-    freqs = np.outer(t, inv_freq)
+    sinusoid_inp = np.expand_dims(np.einsum("i , j -> i j",
+                                            np.arange(num_pos, dtype=dtype),
+                                            inv_freq,
+                                            dtype=dtype),
+                                  axis=-1)

     _mscale = float(
         yarn_get_mscale(scaling_factor, mscale) /
         yarn_get_mscale(scaling_factor, mscale_all_dim))

-    emb = np.concatenate((freqs, freqs), axis=-1)
+    if duplicate_data:
+        emb = np.concatenate((sinusoid_inp, sinusoid_inp), axis=-2)
+    else:
+        emb = sinusoid_inp

     concat = np.concatenate((np.cos(emb) * _mscale, np.sin(emb) * _mscale),
                             axis=-1)
-
-    concat = concat.reshape((num_pos, 2, dim))
-    concat = np.transpose(concat, (0, 2, 1))
-
-    return concat.reshape((1, -1)).astype(dtype)
+    return inv_freq, concat.reshape((1, -1)).astype(dtype)
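For context, `yarn_get_mscale` (used in the `_mscale` ratio above) follows the DeepSeek-V2 modeling code that the comment cites; a minimal sketch for reference (reproduced from that upstream file as best understood, not from this diff):

import math

# Minimal sketch of yarn_get_mscale per the DeepSeek-V2 modeling code.
def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

# When mscale == mscale_all_dim the correction cancels:
ratio = yarn_get_mscale(40.0, 1.0) / yarn_get_mscale(40.0, 1.0)  # -> 1.0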
@@ -1224,6 +1226,34 @@
                     is_buffer=True))
            model_cls.short_mscale = short_mscale
            model_cls.long_mscale = long_mscale
        elif rotary_embedding_scale_type == RotaryScalingType.yarn:
            beta_fast = rotary_embedding_scaling.get("beta_fast", 32.0)
            beta_slow = rotary_embedding_scaling.get("beta_slow", 1.0)
            mscale = rotary_embedding_scaling.get("mscale", 1.0)
            mscale_all_dim = rotary_embedding_scaling.get("mscale_all_dim", 0.0)
            original_max_position_embeddings = rotary_embedding_scaling.get(
                "original_max_position_embeddings", 4096)
            rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(
                max_position_embeddings, rotary_embedding_dim,
                rotary_embedding_base, rotary_embedding_scale,
                original_max_position_embeddings, beta_fast, beta_slow, mscale,
                mscale_all_dim, False)

            embed_positions = RopeEmbeddingUtils.create_sinusoidal_positions(
                max_position_embeddings,
                rotary_embedding_dim,
            )
            model_cls.register_parameter(
                'embed_positions',
                Parameter(embed_positions, dtype='float32', is_buffer=True))
            model_cls.register_parameter(
                'rotary_inv_freq',
                Parameter(rotary_inv_freq, dtype='float32', is_buffer=True))
            model_cls.register_parameter(
                'embed_positions_for_gpt_attention',
                Parameter(embed_positions_for_gpt_attention,
                          dtype='float32',
                          is_buffer=True))
        else:

            def register_rope_params(rotary_base, names_to_register):
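For context on what the new `yarn` branch precomputes: YaRN blends the original ("extrapolated") RoPE inverse frequencies with position-interpolated ones, using a linear ramp between the `beta_fast` and `beta_slow` cutoffs. A minimal NumPy sketch of that blending, written for illustration only (this is not the `RopeEmbeddingUtils` implementation; parameter names follow the diff above):

import math

import numpy as np


def yarn_inv_freq(dim, base, scale, orig_max_pos, beta_fast=32.0, beta_slow=1.0):
    """Illustrative YaRN frequency blending, not the TRT-LLM implementation."""
    pos_freqs = base**(np.arange(0, dim, 2, dtype=np.float32) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs  # original RoPE frequencies
    inv_freq_interpolation = 1.0 / (scale * pos_freqs)  # position interpolation

    def correction_dim(num_rotations):
        # Dimension whose wavelength completes `num_rotations` turns over
        # the original training context.
        return (dim * math.log(orig_max_pos /
                               (num_rotations * 2 * math.pi))) / (2 *
                                                                  math.log(base))

    low = max(math.floor(correction_dim(beta_fast)), 0)
    high = min(math.ceil(correction_dim(beta_slow)), dim - 1)
    ramp = np.clip(
        (np.arange(dim // 2, dtype=np.float32) - low) / max(high - low, 1e-3),
        0, 1)
    mask = 1.0 - ramp  # 1 for high-frequency dims, 0 for low-frequency dims
    # High-frequency dims keep extrapolation; low-frequency dims are interpolated.
    return inv_freq_interpolation * (1.0 - mask) + inv_freq_extrapolation * mask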
@@ -1505,7 +1535,7 @@
        if hasattr(self, 'kv'):
            # We optimize the graph by adding kv in the cross attention layer, preventing computing the
            # query of encoder_output.
-           assert qkv_lora_params == None, "Not support LoRA when we only compute key/value in cross atteniton"
+           assert qkv_lora_params is None, "Not support LoRA when we only compute key/value in cross atteniton"
            # see optimization_model's optimize_cross_qkv
            cross_kv = self.kv(encoder_output, qkv_lora_params)
            base_shape = shape(
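The one-line change above swaps an equality test for an identity test. `is None` is the idiomatic check, and it is also robust when operands overload `==`: graph-building tensor types commonly override `__eq__` to construct a comparison node rather than return a plain bool. A toy illustration (the `SymbolicTensor` class is hypothetical, not a TensorRT-LLM type):

class SymbolicTensor:
    """Hypothetical graph-building type that overloads comparison."""

    def __eq__(self, other):
        return SymbolicTensor()  # builds a node; the result is always truthy


t = SymbolicTensor()
print(t == None)  # a SymbolicTensor instance, not False
print(t is None)  # False: identity checks cannot be overridden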
@@ -2176,6 +2206,7 @@
                 tp_rank=0,
                 cp_group=None,
                 cp_size=1,
                 cp_rank=0,
                 relative_attention=False,
                 max_distance=0,
                 num_buckets=0,
@@ -2196,6 +2227,7 @@
        self.tp_rank = tp_rank
        self.cp_group = cp_group
        self.cp_size = cp_size
        self.cp_rank = cp_rank

        self.num_layers = num_layers
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
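These two hunks thread the context-parallel settings (`cp_group`, `cp_size`, `cp_rank`) through the attention module's constructor alongside the existing tensor-parallel ones. Context parallelism shards the sequence dimension across ranks, so each rank attends over its own slice of tokens. A hedged sketch of the partitioning idea (the helper name is ours, not TensorRT-LLM's):

def shard_sequence(token_ids, cp_size, cp_rank):
    """Illustrative contiguous split of one sequence across cp_size ranks."""
    chunk = (len(token_ids) + cp_size - 1) // cp_size  # ceil division
    return token_ids[cp_rank * chunk:(cp_rank + 1) * chunk]


# 10 tokens across cp_size=4 -> slices of 3, 3, 3 and 1 tokens
print([shard_sequence(list(range(10)), 4, r) for r in range(4)])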
@@ -2295,7 +2327,6 @@
        if default_net().plugin_config.bert_attention_plugin:
            # TRT plugin mode
            assert input_lengths is not None
-           assert self.cp_size == 1
            assert get_sm_version() < 100 or get_sm_version() >= 120, \
                "bert_attention_plugin does not support SM100"
            context = bert_attention(
@@ -2308,7 +2339,10 @@
                max_distance=self.max_distance,
                relative_attention_bias=self.rel_attn_table.value
                if self.relative_attention else None,
-               max_input_length=max_input_length)
+               max_input_length=max_input_length,
+               cp_group=self.cp_group,
+               cp_size=self.cp_size,
+               cp_rank=self.cp_rank)
        else:
            # plain TRT mode
            def transpose_for_scores(x):
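With the `cp_size == 1` restriction dropped, the plugin path now receives the context-parallel settings directly. The surviving compute-capability assert excludes SM 100-119, which covers data-center Blackwell parts (e.g. SM 100 on B200), while older architectures and SM 120 and newer still pass. Restated as a hypothetical predicate for clarity:

def bert_attention_plugin_supported(sm_version):
    # Mirrors `get_sm_version() < 100 or get_sm_version() >= 120`:
    # everything except the SM 100-119 range is accepted.
    return sm_version < 100 or sm_version >= 120


assert bert_attention_plugin_supported(90)       # Hopper (H100)
assert not bert_attention_plugin_supported(100)  # data-center Blackwell (B200)
assert bert_attention_plugin_supported(120)      # SM 120 and newer pass again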
@@ -2628,7 +2662,7 @@
            mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
            self.q_scaling = 1.0 / (mscale * mscale)

-           embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(
+           _, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_yarn(
                self.max_position_embeddings, self.qk_rope_head_dim,
                self.rotary_embedding_base, self.rotary_scaling["factor"],
                rotary_embedding_origin_max_position, rotary_embedding_beta_fast,
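This call site matches the two-value return introduced in the first hunk: `create_sinusoidal_positions_yarn` now also yields `rotary_inv_freq`, which is discarded here. On the surrounding `q_scaling` context: the attention kernels scale logits by roughly 1 / (q_scaling * sqrt(head_dim)), so `q_scaling = 1 / mscale**2` multiplies the logits by `mscale**2`, one magnitude correction each for query and key. A sketch of `yarn_get_mscale` as it appears in common YaRN implementations (assumed signature, shown for illustration):

import math


def yarn_get_mscale(scale=1.0, mscale=1.0):
    """Common YaRN magnitude correction: grow attention logits
    logarithmically with the context-stretch factor `scale`."""
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


# With a softmax scale of 1 / (q_scaling * sqrt(head_dim)), setting
# q_scaling = 1 / mscale**2 multiplies the logits by mscale**2:
# one factor of mscale for the query and one for the key.
m = yarn_get_mscale(scale=4.0, mscale=1.0)
q_scaling = 1.0 / (m * m)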
@@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc2" />
+<meta name="docsearch:version" content="0.20.0rc3" />
</head>
@@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc2" />
+<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@@ -814,8 +816,9 @@
         inputs = prompt_inputs(inputs)

         if not inputs.get("prompt") and inputs.get(
-                "prompt_token_ids") and not isinstance(self.input_processor,
-                                                       DefaultInputProcessor):
+                "prompt_token_ids") and inputs.get(
+                    "multi_modal_data") and not isinstance(
+                        self.input_processor, DefaultInputProcessor):
             # VLMs need to process/tokenize the prompt in their own way
             prompt = self.tokenizer.decode(inputs['prompt_token_ids'])
             inputs = TextPrompt(
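In plain terms, prompts given only as token IDs now take the decode-and-reprocess path only when multi-modal data is also attached and a non-default input processor is installed. A minimal sketch of the tightened guard, assuming a dict-shaped `inputs` and a boolean flag as stand-ins for the real objects:

# Standalone restatement of the condition above; the field names follow the
# diff, the example inputs are invented.
def needs_vlm_reprocessing(inputs: dict, uses_default_processor: bool) -> bool:
    return bool(not inputs.get("prompt") and inputs.get("prompt_token_ids")
                and inputs.get("multi_modal_data")
                and not uses_default_processor)

# Token-ID-only prompt without multi-modal data: previously re-decoded, now not.
print(needs_vlm_reprocessing({"prompt_token_ids": [1, 2, 3]}, False))  # False
# With multi-modal data attached, the VLM path is still taken.
print(needs_vlm_reprocessing(
    {"prompt_token_ids": [1, 2, 3], "multi_modal_data": {"image": ["img.png"]}},
    False))  # True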
@@ -1017,7 +1020,7 @@

         if (not self.args.enable_chunked_prefill) and (
                 prompt_len / self.args.parallel_config.cp_size + query_len +
-                sampling_params.max_tokens > max_seq_len):
+                (sampling_params.max_tokens or 0) > max_seq_len):
             raise ValueError(
                 f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) max_tokens ({sampling_params.max_tokens}) should not exceed "
                 f"max_seq_len ({build_config.max_seq_len})")
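The only functional change here is the `or 0` fallback: `max_tokens` may legitimately be `None`, and the old arithmetic then raised a `TypeError` before the intended `ValueError` could fire. A self-contained illustration, with all numbers invented:

prompt_len, query_len, cp_size, max_seq_len = 1000, 0, 1, 2048
max_tokens = None  # a caller may leave max_tokens unset

# Old form: 1000 / 1 + 0 + None > 2048  ->  TypeError at the "+"
# New form treats an unset max_tokens as contributing 0 tokens:
exceeds = prompt_len / cp_size + query_len + (max_tokens or 0) > max_seq_len
print(exceeds)  # False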
@@ -1073,6 +1076,14 @@
                 max_batch_size=max_batch_size,
                 max_num_tokens=max_num_tokens,
                 gather_generation_logits=self.args.gather_generation_logits)
+        if self.args.backend is None:
+            # also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens
+            if max_seq_len is not None:
+                executor_config.max_seq_len = max_seq_len
+            else:
+                engine_config = EngineConfig.from_json_file(self._engine_dir /
+                                                            "config.json")
+                executor_config.max_seq_len = engine_config.build_config.max_seq_len
         if self.args.kv_cache_config is not None:
             executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.kv_cache_config)
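This hunk establishes a fallback order for the TRT workflow: an explicit `max_seq_len` wins, otherwise the value recorded in the engine's `config.json` at build time is used. The helper below only mirrors that control flow and is hypothetical; the real code goes through `EngineConfig.from_json_file`, and the nested JSON layout is an assumption:

import json
from pathlib import Path
from typing import Optional

def deduce_max_seq_len(explicit: Optional[int], engine_dir: Path) -> int:
    # Prefer the explicitly configured value.
    if explicit is not None:
        return explicit
    # Otherwise read it back from the engine build metadata.
    config = json.loads((engine_dir / "config.json").read_text())
    return config["build_config"]["max_seq_len"]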
@@ -1160,13 +1171,27 @@
         if self.runtime_context is not None:
             return self.runtime_context.tokenizer

-        # TODO smor- need to look more on this
-        # what should be chose as the tokenizer? the adapter or the base model?
-        # what happens if we have multiple adapters?
+        # TODO smor- need to refine what is the desired behavior if lora is enabled
+        # in terms of the tokenizer initialization process
         if hasattr(
                 self.args, "backend"
         ) and self.args.backend == "pytorch" and self.args.lora_config is not None:
-            tokenizer_path = self.args.lora_config.lora_dir[0]
+            num_lora_dirs = len(self.args.lora_config.lora_dir)
+            if num_lora_dirs == 1:
+                tokenizer_path = self.args.lora_config.lora_dir[0]
+                try:
+                    tokenizer = ModelLoader.load_hf_tokenizer(
+                        tokenizer_path,
+                        trust_remote_code=self.args.trust_remote_code,
+                        use_fast=self.args.tokenizer_mode != 'slow')
+                    if tokenizer is None:
+                        tokenizer_path = self.args.model
+                    else:
+                        return tokenizer
+                except Exception:
+                    tokenizer_path = self.args.model
+            else:
+                tokenizer_path = self.args.model
         else:
             tokenizer_path = self.args.model
         return ModelLoader.load_hf_tokenizer(
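Read as a whole, the tokenizer property now trusts a LoRA directory as the tokenizer source only when exactly one directory is configured, and it falls back to the base model whenever the adapter tokenizer is missing or fails to load. A simplified restatement of that decision chain; the `load` argument is a stand-in for `ModelLoader.load_hf_tokenizer`:

def pick_tokenizer_source(lora_dirs: list, base_model: str, load) -> str:
    # Exactly one adapter dir: try its tokenizer first.
    if len(lora_dirs) == 1:
        try:
            if load(lora_dirs[0]) is not None:
                return lora_dirs[0]
        except Exception:
            pass  # broken adapter tokenizer: fall through to the base model
    # Zero or several adapter dirs, or a failed load: use the base model.
    return base_model

print(pick_tokenizer_source(["./adapter"], "./base", lambda p: object()))  # ./adapter
print(pick_tokenizer_source(["a", "b"], "./base", lambda p: object()))     # ./base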
@@ -515,7 +517,8 @@
 from strenum import StrEnum
 from transformers import PreTrainedTokenizerBase

-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_manager import (LoraConfig,
+                                       get_default_trtllm_modules_to_hf_modules)

 from .._utils import mpi_rank
 from ..auto_parallel import AutoParallelConfig, infer_cluster_config
@@ -1356,13 +1359,22 @@
     # LoRA arguments
     enable_lora: bool = Field(default=False, description="Enable LoRA.")

-    max_lora_rank: Optional[int] = Field(default=None,
-                                         description="The maximum LoRA rank.")
+    max_lora_rank: Optional[int] = Field(
+        default=None,
+        description="The maximum LoRA rank.",
+        deprecated="Use lora_config.max_lora_rank instead.")

-    max_loras: int = Field(default=4, description="The maximum number of LoRA.")
+    max_loras: int = Field(default=4,
+                           description="The maximum number of LoRA.",
+                           deprecated="Use lora_config.max_loras instead.")

-    max_cpu_loras: int = Field(default=4,
-                               description="The maximum number of LoRA on CPU.")
+    max_cpu_loras: int = Field(
+        default=4,
+        description="The maximum number of LoRA on CPU.",
+        deprecated="Use lora_config.max_cpu_loras instead.")

+    lora_config: Optional[LoraConfig] = Field(
+        default=None, description="LoRA configuration for the model.")
+
     # Prompt adapter arguments
     enable_prompt_adapter: bool = Field(default=False,
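For callers, the flat LoRA knobs keep working but are now marked deprecated in favor of the consolidated `LoraConfig` object, which also moves up next to them in the class body (see the next hunk). A hedged migration sketch; only the field names named in the deprecation messages above are assumed to exist on `LoraConfig`:

from tensorrt_llm.lora_manager import LoraConfig

# Deprecated spelling (still accepted, now warns):
#   LLM(model=..., enable_lora=True, max_lora_rank=64, max_loras=4, max_cpu_loras=4)

# Preferred spelling going forward:
lora_config = LoraConfig(max_lora_rank=64, max_loras=4, max_cpu_loras=4)
# llm = LLM(model=..., lora_config=lora_config)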
@@ -1475,10 +1487,6 @@
         description="The backend to use.",
         exclude=True)

-    # TODO smor- this is an experimental feature and is probably subject to change before 1.0 release
-    lora_config: Optional[LoraConfig] = Field(
-        default=None, description="LoRA configuration for the model.")
-
     # private fields those are unstable and just for internal use
     num_postprocess_workers: int = Field(
         default=0,
@@ -1718,7 +1726,9 @@
         if self.parallel_config._world_size == 1:
             self.build_config.plugin_config.nccl_plugin = None

+        if self.enable_lora:
+            self._ensure_lora_config_consistency()
+
         if self.enable_lora and self.lora_config is None and self.backend != 'pytorch':
             self.build_config.plugin_config.lora_plugin = 'auto'
             if self.max_lora_rank is not None:
                 self.build_config.lora_config.max_lora_rank = self.max_lora_rank
@@ -1788,11 +1798,41 @@
         else:
             self.decoding_config = None

+    def _ensure_lora_config_consistency(self):
+        if self.lora_config:
+            if self.max_lora_rank is not None:
+                logger.warning(
+                    "max_lora_rank is ignored when lora_config is provided.")
+            if self.max_loras != self.lora_config.max_loras:
+                logger.warning(
+                    "max_loras is ignored when lora_config is provided.")
+            if self.max_cpu_loras != self.lora_config.max_cpu_loras:
+                logger.warning(
+                    "max_cpu_loras is ignored when lora_config is provided.")
+
+            if len(self.lora_config.lora_dir) == 0:
+                # TODO [TRTLLM-5173]
+                logger.warning(
+                    "lora_dir is empty, so custom embedding or lm head will not be applied."
+                )
+
         if self.enable_lora and self.lora_config is not None and self.backend == 'pytorch':
             logger.warning(
-                "Lora is an experimental feature and is probably subject to change before 1.0 release"
+                "enable_lora is ignored when lora_config is provided for pytorch backend."
             )

+        if self.lora_config is not None:
+            if len(self.lora_config.lora_dir) == 0 and len(
+                    self.lora_config.lora_target_modules) == 0:
+                logger.warning(
+                    "Both lora_dir and lora_target_modules are empty, so all LoRA modules will be expected. "
+                    "This will lead to serious memory consumption. Please provide either lora_dir or lora_target_modules if this behavior is not what you expect."
+                )
+                default_trtllm_modules_to_hf_modules = get_default_trtllm_modules_to_hf_modules(
+                )
+                self.lora_config.lora_target_modules = list(
+                    default_trtllm_modules_to_hf_modules.keys())
+
     @property
     def _build_config_mutable(self) -> bool:
         return self.model_format is not _ModelFormatKind.TLLM_ENGINE
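Two behaviors fall out of this validator: a provided `lora_config` silences and overrides the deprecated flat fields, and a `lora_config` with neither `lora_dir` nor `lora_target_modules` is expanded to every default target module, at a warned-about memory cost. A toy reproduction of the precedence rule, with the function and dict invented for illustration:

import logging
logging.basicConfig()
log = logging.getLogger("lora-consistency")

def resolve_max_lora_rank(flat_value, lora_config):
    # Mirrors the warning semantics above: lora_config, when present,
    # is the single source of truth and the flat field is ignored.
    if lora_config is not None:
        if flat_value is not None:
            log.warning("max_lora_rank is ignored when lora_config is provided.")
        return lora_config["max_lora_rank"]
    return flat_value

print(resolve_max_lora_rank(32, {"max_lora_rank": 64}))  # warns, prints 64
print(resolve_max_lora_rank(32, None))                   # prints 32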
@@ -852,6 +854,9 @@
             return self.queue.get()  # should get a True if success
         return False

+    def abort(self):
+        self.shutdown()
+
     def shutdown(self, wait=True):
         if self._is_shutdown:
             return
@@ -868,7 +873,7 @@
         finally:
             self._is_shutdown = True

-    def abort(self):
+    def shutdown_abort(self, grace: float = 60, reason=None):
         self.shutdown()

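Together these two hunks split the old single entry point: `abort()` becomes a thin alias for `shutdown()`, while the former `abort` slot is renamed `shutdown_abort(grace, reason)`; the extra parameters are accepted but, as far as this diff shows, the body still just delegates. A structural sketch, with the class and flag names invented:

class SessionLike:
    def __init__(self):
        self._is_shutdown = False

    def shutdown(self, wait=True):
        if self._is_shutdown:  # idempotent: repeated calls are no-ops
            return
        self._is_shutdown = True

    def abort(self):
        # alias retained for callers that still say "abort"
        self.shutdown()

    def shutdown_abort(self, grace: float = 60, reason=None):
        # grace/reason are accepted for interface compatibility; the diff
        # shows the body simply delegating to shutdown()
        self.shutdown()

s = SessionLike()
s.shutdown_abort(grace=5)
print(s._is_shutdown)  # True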
@@ -935,7 +940,7 @@
                 print_colored_debug(
                     f"RemoteMpiCommSessionServer [rank{global_mpi_rank()}] received shutdown signal\n",
                     "green")
-                self.session.shutdown()
+                self.session.shutdown_abort()
                 break
             else:
                 print_colored_debug(
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@@ -323,19 +323,19 @@
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
 <ul class="nav bd-sidenav">
 <li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
 </details></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
 <li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
 </ul>
 </div>
 </nav></div>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />


</head>
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -637,6 +639,7 @@
<span class="n">tp_rank</span><span class="o">=</span><span class="n">mapping</span><span class="o">.</span><span class="n">tp_rank</span><span class="p">,</span>
<span class="n">cp_group</span><span class="o">=</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_group</span><span class="p">,</span>
<span class="n">cp_size</span><span class="o">=</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span><span class="p">,</span>
<span class="n">cp_rank</span><span class="o">=</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_rank</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">quant_mode</span><span class="o">=</span><span class="n">quant_mode</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">norm2</span> <span class="o">=</span> <span class="n">LayerNorm</span><span class="p">(</span><span class="n">hidden_size</span><span class="p">,</span> <span class="n">elementwise_affine</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="mf">1e-6</span><span class="p">)</span>
@ -815,6 +818,7 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span> <span class="o">==</span> <span class="mi">0</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">chunk</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span><span class="p">,</span> <span class="n">dim</span><span class="o">=</span><span class="mi">1</span><span class="p">)[</span><span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_rank</span><span class="p">]</span>
<span class="n">input_lengths</span> <span class="o">=</span> <span class="n">input_lengths</span> <span class="o">//</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span>
<span class="k">for</span> <span class="n">block</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">blocks</span><span class="p">:</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">block</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">input_lengths</span><span class="p">)</span> <span class="c1"># (N, T, D)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">register_network_output</span><span class="p">(</span><span class="s1">'before_final_layer'</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span>
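The new lines in the @ -815 hunk shard the activations along the sequence dimension for context parallelism: each rank keeps the slice selected by its cp_rank and scales input_lengths down accordingly. A minimal sketch of that chunking step, using torch.chunk on a stub tensor rather than TensorRT-LLM's real Mapping object (shard_sequence is a hypothetical helper named here only for illustration):

import torch

def shard_sequence(x: torch.Tensor, cp_size: int, cp_rank: int) -> torch.Tensor:
    # Mirrors the hunk's assertion: the sequence length must divide
    # evenly across the context-parallel ranks.
    assert x.shape[1] % cp_size == 0
    # Each rank keeps only its own contiguous chunk of dim 1 (the sequence).
    return torch.chunk(x, cp_size, dim=1)[cp_rank]

x = torch.randn(2, 8, 16)                        # (batch, sequence, hidden)
local = shard_sequence(x, cp_size=4, cp_rank=1)
print(local.shape)                               # torch.Size([2, 2, 16])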
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
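Sidebar reorderings like the hunk above are easier to review as ordered (title, href) pairs than as raw HTML lines. A minimal stdlib-only sketch of such an extractor follows; the input path is illustrative, and it collects every "reference internal" link on the page, which is sufficient for diffing two builds entry by entry:

from html.parser import HTMLParser

# Sketch: collect (href, title) pairs for internal links in a built Sphinx
# page so sidebar reorderings can be diffed by entry rather than by raw HTML.
class SidebarLinks(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_link = False
        self.links = []  # [href, accumulated title text] in document order

    def handle_starttag(self, tag, attrs):
        a = dict(attrs)
        if tag == "a" and "reference internal" in (a.get("class") or ""):
            self.in_link = True
            self.links.append([a.get("href", ""), ""])

    def handle_data(self, data):
        if self.in_link:
            self.links[-1][1] += data

    def handle_endtag(self, tag):
        if tag == "a":
            self.in_link = False

parser = SidebarLinks()
with open("latest/index.html", encoding="utf-8") as f:  # illustrative path
    parser.feed(f.read())
for href, title in parser.links:
    print(f"{title}\t{href}")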
@@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>

@@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
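The theme_switcher_json_url / theme_switcher_version_match pair above is how the pydata-sphinx-theme wires up its version dropdown, and the rc2-to-rc3 bump is the substance of this commit. A minimal conf.py sketch, assuming these pages are built with that theme (option names follow the theme's documented switcher settings; the version string mirrors the generated page above):

# Minimal Sphinx conf.py sketch for the pydata-sphinx-theme version switcher.
# Assumes pydata-sphinx-theme is installed; values mirror the generated page.
version = "0.20.0rc3"

html_theme = "pydata_sphinx_theme"
html_theme_options = {
    "switcher": {
        # JSON file listing every published version and its URL.
        "json_url": "./_static/switcher.json",
        # Must equal the "version" field of this build's entry in that JSON.
        "version_match": version,
    },
    # Mirrors DOCUMENTATION_OPTIONS.show_version_warning_banner = false above.
    "show_version_warning_banner": False,
}

Each switcher.json entry is typically an object with "name", "version", and "url" fields, so bumping a release candidate means updating both version_match here and the matching entry in that file.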
@@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc2" />
+<meta name="docsearch:version" content="0.20.0rc3" />


</head>
@@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
@@ -647,6 +649,11 @@
# InternLM-XComposer2 has a mask for partial lora
# Therefore we need an additional flag for this mask
has_partial_lora_mask = True
if hf_config.model_type == 'mistral3':
    from transformers import Mistral3Config
    hf_config = Mistral3Config.from_pretrained(
        hf_config_dir).text_config
    hf_config.architectures = ["MistralForCausalLM"]

num_key_value_heads = getattr(hf_config, "num_key_value_heads",
                              hf_config.num_attention_heads)
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>

@@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>

@@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />


</head>
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@@ -670,8 +672,7 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">encoder_runtime_mapping</span><span class="o">.</span><span class="n">has_pp</span><span class="p">():</span>
<span class="c1"># for Pipeline Parallelism in encoder</span>
<span class="bp">self</span><span class="o">.</span><span class="n">nccl_comm</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">classes</span><span class="o">.</span><span class="n">trtllm</span><span class="o">.</span><span class="n">NcclCommunicatorOp</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">encoder_runtime_mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">encoder_runtime_mapping</span><span class="o">.</span><span class="n">pp_size</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">encoder_runtime_mapping</span><span class="o">.</span><span class="n">world_size</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">encoder_runtime_mapping</span><span class="o">.</span><span class="n">rank</span><span class="p">)</span>

<span class="c1"># session setup</span>
@@ -1444,7 +1446,7 @@

<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">has_pp</span><span class="p">():</span>
<span class="bp">self</span><span class="o">.</span><span class="n">nccl_comm</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">classes</span><span class="o">.</span><span class="n">trtllm</span><span class="o">.</span><span class="n">NcclCommunicatorOp</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">pp_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">rank</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">world_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">rank</span><span class="p">)</span>

<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">is_last_pp_rank</span><span class="p">():</span>
<span class="bp">self</span><span class="o">.</span><span class="n">decoder_logits_dtype</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_tensor_dtype</span><span class="p">(</span><span class="s1">'logits'</span><span class="p">)</span>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -529,8 +531,8 @@
from ..bindings import (DataType, GptJsonConfig, KVCacheType, ModelConfig,
                        WorldConfig)
from ..bindings import executor as trtllm
from ..bindings.executor import (ExternalDraftTokensConfig, OrchestratorConfig,
                                 ParallelConfig)
from ..bindings.executor import (DecodingMode, ExternalDraftTokensConfig,
                                 OrchestratorConfig, ParallelConfig)
from ..builder import EngineConfig
from ..layers import MropeParams
from ..logger import logger
@ -855,6 +857,10 @@
            decoding_config.lookahead_decoding_config = trtllm.LookaheadDecodingConfig(
                w, n, g)

        if use_variable_beam_width_search:
            decoding_config.decoding_mode = DecodingMode.BeamSearch(
            ).useVariableBeamWidthSearch(True)

        if max_batch_size is None:
            max_batch_size = model_config.max_batch_size
        else:
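For orientation, a minimal sketch of how these two decoding options fit together, using the executor bindings imported in the hunk above. Constructing DecodingConfig directly and the (4, 8, 8) lookahead sizes are illustrative assumptions, not values from the source.

from tensorrt_llm.bindings import executor as trtllm
from tensorrt_llm.bindings.executor import DecodingMode

# Sketch only: DecodingConfig construction and the (w, n, g) values are assumed.
decoding_config = trtllm.DecodingConfig()
# Lookahead decoding: (window, ngram, verification-set) sizes, per the (w, n, g) call above.
decoding_config.lookahead_decoding_config = trtllm.LookaheadDecodingConfig(4, 8, 8)
# Variable beam width search is toggled on the decoding mode, as in the hunk above.
decoding_config.decoding_mode = DecodingMode.BeamSearch().useVariableBeamWidthSearch(True)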
@ -897,7 +903,6 @@
            use_gpu_direct_storage=use_gpu_direct_storage,
            gpu_weights_percent=gpu_weights_percent,
            gather_generation_logits=gather_generation_logits,
            use_variable_beam_width_search=use_variable_beam_width_search,
        )
        trtllm_config.enable_chunked_context = enable_chunked_context
        trtllm_config.extended_runtime_perf_knob_config = extended_runtime_perf_knob_config
@ -918,6 +920,13 @@
        if self.num_frames is None:
            self.num_frames = 8
        assert self.args.video_path is None or self.args.image_path is None
        if self.model_type == "pixtral":
            hf_config = AutoConfig.from_pretrained(self.args.hf_model_dir)
            self.image_size = hf_config.vision_config.image_size
            self.patch_size = hf_config.vision_config.patch_size
            self.vocab_size = hf_config.text_config.vocab_size
            self.image_token_index = hf_config.image_token_index
            self.spatial_merge_size = hf_config.spatial_merge_size

        self.audio_input_names = self.audio_output_names = None
        if self.model_type == "mllama":
@ -1127,6 +1136,10 @@
            self.processor = AutoProcessor.from_pretrained(
                self.args.hf_model_dir, trust_remote_code=True, num_crops=16)

        elif 'pixtral' in self.model_type:
            self.processor = AutoProcessor.from_pretrained(
                self.args.hf_model_dir)

        elif 'internlm' in self.model_type:
            image_size = 490
            self.processor = transforms.Compose([
@ -1420,6 +1433,33 @@
            audio_mask = audio.new_ones(*audio.shape[:2])
            audio_mask[-1, -pad:] = 0
            other_audio_inputs['attention_mask'] = audio_mask.bool()
        elif self.model_type == 'pixtral':
            # Hold on to pixel_values and input_ids.
            dtype = str_dtype_to_torch(self.vision_precision)
            pixel_values = image["pixel_values"].to(device="cuda", dtype=dtype)
            input_ids = image["input_ids"].to(device="cuda")

            # Shape of pixel values from the processor varies with the raw image.
            # So we create a new tensor with a fixed shape as expected by the vision
            # encoder and create a corresponding attention mask.
            image_size = self.image_size
            patch_size = self.patch_size
            d_min = torch.finfo(dtype).min
            num_patches = (image_size // patch_size)
            image = torch.full((1, 3, image_size, image_size),
                               fill_value=0,
                               dtype=dtype,
                               device="cuda")
            attention_mask = torch.full((1, num_patches, num_patches),
                                        fill_value=d_min,
                                        dtype=dtype,
                                        device="cuda")
            h, w = pixel_values.shape[-2:]
            image[..., :h, :w] = pixel_values
            attention_mask[..., :h // patch_size, :w // patch_size] = 0
            other_vision_inputs = {
                "attention_mask": attention_mask,
            }
        elif self.model_type == 'llava_next':
            input = image
            image = input['pixel_values']
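A small CPU-only illustration of the padding scheme above, with assumed toy sizes (image_size=8, patch_size=2) rather than real Pixtral dimensions; it shows how valid patches get an additive mask of 0 while padded patches get the dtype minimum.

import torch

image_size, patch_size = 8, 2                        # assumed toy sizes -> 4x4 patch grid
dtype = torch.float16
d_min = torch.finfo(dtype).min
num_patches = image_size // patch_size

pixel_values = torch.randn(1, 3, 6, 4, dtype=dtype)  # variable-size processor output
image = torch.full((1, 3, image_size, image_size), fill_value=0.0, dtype=dtype)
attention_mask = torch.full((1, num_patches, num_patches), fill_value=d_min, dtype=dtype)

h, w = pixel_values.shape[-2:]
image[..., :h, :w] = pixel_values                    # real pixels sit in the top-left corner
attention_mask[..., :h // patch_size, :w // patch_size] = 0  # 0 = attend, d_min = masked out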
@ -1633,6 +1673,17 @@
            audio_features = audio_features.unsqueeze(0).repeat(
                self.args.batch_size, 1, 1)
            length = input_ids.shape[1]

        elif self.model_type == 'pixtral':
            relevant_patch_size = self.patch_size * self.spatial_merge_size
            output_img_size = self.image_size // relevant_patch_size
            visual_features = visual_features.reshape(
                output_img_size, output_img_size,
                -1)[:h // relevant_patch_size, :w //
                    relevant_patch_size].flatten(0, 1)
            input_ids = self.ptuning_setup_pixtral(input_ids=input_ids)
            length = input_ids.shape[1]

        elif self.model_type == 'llava_next':
            visual_features = LlavaNextUtils.rearrange_image_features(
                visual_features, self.image_newlines["image_newline"],
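To make the reshape-and-crop arithmetic concrete, a toy check under assumed sizes (not the real model's): with patch_size=2 and spatial_merge_size=2, a 16x16 padded image yields a 4x4 feature grid, and only the tiles covering the real h x w extent survive the crop.

import torch

patch_size, spatial_merge_size, image_size = 2, 2, 16  # assumed toy sizes
relevant_patch_size = patch_size * spatial_merge_size  # 4
output_img_size = image_size // relevant_patch_size    # 4x4 feature grid

h, w = 12, 8                                           # real (unpadded) image extent
visual_features = torch.randn(output_img_size * output_img_size, 64)
visual_features = visual_features.reshape(
    output_img_size, output_img_size,
    -1)[:h // relevant_patch_size, :w // relevant_patch_size].flatten(0, 1)
print(visual_features.shape)  # torch.Size([6, 64]) -- a 3x2 grid of valid feature tiles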
@ -1733,7 +1784,7 @@
                torch.int32)

        if self.model_type in [
                'fuyu', 'kosmos-2', 'phi-3-vision', 'llava_next'
                'fuyu', 'kosmos-2', 'phi-3-vision', 'llava_next', 'pixtral'
        ]:
            return input_ids, input_lengths, [
                visual_features
@ -2535,6 +2586,23 @@
        return res_input_ids

    def ptuning_setup_pixtral(self, input_ids):
        # input_ids obtained from the processor has token_ids for text as well as image
        # tokens, where each image token is represented by the same image_token_index
        # (10 for this model).
        image_token_index = self.image_token_index
        vocab_size = self.vocab_size
        # Replace all image tokens with a unique token_id > text vocab_size.
        # This shall be used to look up the prompt table.
        replacer = vocab_size
        for i in range(len(input_ids[0])):
            if input_ids[0][i] == image_token_index:
                input_ids[0][i] = replacer
                replacer += 1
        return input_ids

    def ptuning_setup_llava_next(self, visual_features, pre_prompt,
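Read as a mapping, the loop gives the i-th image token the fake id vocab_size + i, which later selects row i of the prompt table. An equivalent vectorized sketch with assumed toy values:

import torch

vocab_size, image_token_index = 32, 10            # assumed toy values
input_ids = torch.tensor([[1, 10, 10, 10, 5, 7]])

is_image = input_ids[0] == image_token_index
input_ids[0, is_image] = vocab_size + torch.arange(int(is_image.sum()))
print(input_ids)  # tensor([[ 1, 32, 33, 34,  5,  7]])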
@ -2918,6 +2986,18 @@
                audios=[raw_audio],
                return_tensors="pt")

        elif 'pixtral' in self.model_type:
            # Send image and text prompt to processor.
            pre_prompt = "<s>[INST][IMG]"
            if input_text is None:
                input_text = "What is in the image?"
            post_prompt = "[/INST]"
            prompt = pre_prompt + input_text + post_prompt
            dtype = str_dtype_to_torch(self.vision_precision)
            image = self.processor(text=prompt,
                                   images=[raw_image],
                                   return_tensors="pt").to(dtype)

        elif 'internvl' in self.model_type:
            pre_prompt = "<|system|>\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|end|><|user|>\n<image>\n"
            if input_text is None:
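For reference, the assembled default prompt follows the [INST]...[/INST] instruct template used above (illustration only):

prompt = "<s>[INST][IMG]" + "What is in the image?" + "[/INST]"
# -> "<s>[INST][IMG]What is in the image?[/INST]"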
@ -3102,7 +3182,8 @@
            post_prompt = [post_prompt] * self.args.batch_size
        if self.model_type not in [
                'fuyu', 'pix2struct', 'kosmos-2', 'vila', 'phi-3-vision',
                'phi-4-multimodal', 'llava_next', 'internvl', 'llava_onevision'
                'phi-4-multimodal', 'llava_next', 'internvl', 'llava_onevision',
                'pixtral'
        ]:
            if image is not None:
                if image.dim() == 5:
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc2" />
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -323,19 +323,19 @@
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -344,19 +344,19 @@
|
||||
</details></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../examples/customization.html">LLM Common Customizations</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_customize.html">Generate text with customization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
|
||||
@ -369,6 +369,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/curl_completion_client.html">Curl Completion Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client.html">Genai Perf Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
|
||||
@ -446,6 +447,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
|
||||
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc2" />
<meta name="docsearch:version" content="0.20.0rc3" />

</head>
@ -323,19 +323,19 @@
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -344,19 +344,19 @@
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
@ -369,6 +369,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
@ -446,6 +447,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
@ -525,11 +527,13 @@
        regex (str, optional): The generated text is amenable to the user-specified regular expression. Defaults to None.
        grammar (str, optional): The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.
        json_object (bool): If True, the generated text is amenable to json format. Defaults to False.
        structural_tag (str, optional): The generated text is amenable to the user-specified structural tag. Defaults to None.
    """
    json: Optional[Union[str, BaseModel, dict]] = None
    regex: Optional[str] = None
    grammar: Optional[str] = None
    json_object: bool = False
    structural_tag: Optional[str] = None

    def _validate(self):
        num_guides = 0
@ -809,6 +813,11 @@
        if self.guided_decoding is not None:
            self.guided_decoding._validate()

        # correct types as users might pass in logprob=True for Top-1 logprobs
        self.logprobs = self.logprobs and int(self.logprobs)
        self.prompt_logprobs = self.prompt_logprobs and int(
            self.prompt_logprobs)

    @property
    def _greedy_decoding(self) -> bool:
        return (not self.use_beam_search
@ -954,7 +963,7 @@
                    tllme.GuidedDecodingParams.GuideType.JSON)
            elif self.guided_decoding.json is not None:
                json_schema = self.guided_decoding.json
                if isinstance(json, BaseModel):
                if isinstance(json_schema, BaseModel):
                    json_schema = json_schema.model_json_schema()
                if isinstance(json_schema, dict):
                    json_schema = json.dumps(json_schema)
@ -968,6 +977,10 @@
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.EBNF_GRAMMAR,
                self.guided_decoding.grammar)
        elif self.guided_decoding.structural_tag is not None:
            return tllme.GuidedDecodingParams(
                tllme.GuidedDecodingParams.GuideType.STRUCTURAL_TAG,
                self.guided_decoding.structural_tag)
        else:
            return None
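The fields visible in this diff (`json`, `regex`, `grammar`, `json_object`, `structural_tag`) make the intended usage easy to sketch. A minimal example, assuming `tensorrt_llm.sampling_params` exposes both classes as shown above (hedged; this snippet is not part of the commit):

```python
from pydantic import BaseModel

from tensorrt_llm.sampling_params import GuidedDecodingParams, SamplingParams


class Answer(BaseModel):
    city: str
    population: int


# Exactly one guide type may be set; _validate() above rejects combinations.
params = SamplingParams(
    max_tokens=64,
    guided_decoding=GuidedDecodingParams(json=Answer),  # pydantic model -> JSON schema
)
```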
@ -4,40 +4,22 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
    :project: TensorRT-LLM

serialization.h
_______________

.. doxygenfile:: serialization.h
    :project: TensorRT-LLM

disaggServerUtil.h
__________________

.. doxygenfile:: disaggServerUtil.h
    :project: TensorRT-LLM

dataTransceiverState.h
______________________

.. doxygenfile:: dataTransceiverState.h
    :project: TensorRT-LLM

tensor.h
________

.. doxygenfile:: tensor.h
    :project: TensorRT-LLM

executor.h
__________
serialization.h
_______________

.. doxygenfile:: executor.h
.. doxygenfile:: serialization.h
    :project: TensorRT-LLM

types.h
@ -46,3 +28,21 @@ _______
.. doxygenfile:: types.h
    :project: TensorRT-LLM

executor.h
__________

.. doxygenfile:: executor.h
    :project: TensorRT-LLM

dataTransceiverState.h
______________________

.. doxygenfile:: dataTransceiverState.h
    :project: TensorRT-LLM

cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
    :project: TensorRT-LLM
@ -4,70 +4,22 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
lookaheadBuffers.h
__________________

.. doxygenfile:: lookaheadBuffers.h
    :project: TensorRT-LLM

lookaheadModule.h
_________________

.. doxygenfile:: lookaheadModule.h
    :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
    :project: TensorRT-LLM

request.h
iBuffer.h
_________

.. doxygenfile:: request.h
    :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
    :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
    :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
    :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
    :project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
    :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
    :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
    :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
.. doxygenfile:: iBuffer.h
    :project: TensorRT-LLM

modelConfig.h
@ -76,40 +28,112 @@ _____________
.. doxygenfile:: modelConfig.h
    :project: TensorRT-LLM

decodingOutput.h
________________

.. doxygenfile:: decodingOutput.h
    :project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
    :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
    :project: TensorRT-LLM

gptJsonConfig.h
_______________

.. doxygenfile:: gptJsonConfig.h
    :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
    :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
    :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
    :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
    :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
    :project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
    :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
    :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
    :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
    :project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
    :project: TensorRT-LLM

generationOutput.h
__________________

.. doxygenfile:: generationOutput.h
    :project: TensorRT-LLM

generationInput.h
_________________

.. doxygenfile:: generationInput.h
    :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
    :project: TensorRT-LLM

iStatefulGptDecoder.h
_____________________
loraModule.h
____________

.. doxygenfile:: iStatefulGptDecoder.h
.. doxygenfile:: loraModule.h
    :project: TensorRT-LLM

eagleModule.h
_____________
speculativeDecodingMode.h
_________________________

.. doxygenfile:: eagleModule.h
.. doxygenfile:: speculativeDecodingMode.h
    :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
    :project: TensorRT-LLM

decodingInput.h
@ -118,10 +142,40 @@ _______________
.. doxygenfile:: decodingInput.h
    :project: TensorRT-LLM

gptJsonConfig.h
_______________
speculativeDecodingModule.h
___________________________

.. doxygenfile:: gptJsonConfig.h
.. doxygenfile:: speculativeDecodingModule.h
    :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
    :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
    :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
    :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
    :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
    :project: TensorRT-LLM

ipcNvlsMemory.h
@ -136,111 +190,27 @@ ________________
.. doxygenfile:: samplingConfig.h
    :project: TensorRT-LLM

gptDecoderBatched.h
___________________
request.h
_________

.. doxygenfile:: gptDecoderBatched.h
.. doxygenfile:: request.h
    :project: TensorRT-LLM

gptSession.h
____________

.. doxygenfile:: gptSession.h
    :project: TensorRT-LLM

lookaheadBuffers.h
__________________

.. doxygenfile:: lookaheadBuffers.h
    :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
    :project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
    :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
    :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
    :project: TensorRT-LLM

medusaModule.h
decoderState.h
______________

.. doxygenfile:: medusaModule.h
.. doxygenfile:: decoderState.h
    :project: TensorRT-LLM

decodingOutput.h
ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
    :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: decodingOutput.h
    :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
    :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
    :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
    :project: TensorRT-LLM

speculativeDecodingModule.h
___________________________

.. doxygenfile:: speculativeDecodingModule.h
    :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
    :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
    :project: TensorRT-LLM

statefulGptDecoderBatched.h
___________________________

.. doxygenfile:: statefulGptDecoderBatched.h
    :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
    :project: TensorRT-LLM

iBuffer.h
_________

.. doxygenfile:: iBuffer.h
.. doxygenfile:: memoryCounters.h
    :project: TensorRT-LLM
@ -173,55 +173,28 @@ value for a given parameter, the vector can be limited to a single element

***Beam-search***

| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF |
| :-----------------------: | :-----------------------------: | :-----------: | :----------------------: | :-----------------------: | :-----------------: |
| `beamWidth` | width for beam-search algorithm | Int | \[0, 1024\] | `0` (disable beam search) | `beam_width` |
| `beamSearchDiversityRate` | diversity of generated tokens | List\[Float\] | \[0, $+\infty$\) | `0.0f` | `diversity_penalty` |
| `lengthPenalty` | penalize longer sequences | List\[Float\] | \[0, $+\infty$\) | `0.0f` | `length_penalty` |
| `earlyStopping` | see description below | List\[Int\] | \($-\infty$, $+\infty$\) | `0` | `early_stopping` |
| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF |
| :-----------------------: | :-----------------------------: | :-----------------: | :----------------------: | :-----------------------: | :-----------------: |
| `beamWidth` | width for beam-search algorithm | Int | \[0, 1024\] | `0` (disable beam search) | `beam_width` |
| `beamSearchDiversityRate` | diversity of generated tokens | List\[Float\] | \[0, $+\infty$\) | `0.0f` | `diversity_penalty` |
| `lengthPenalty` | penalize longer sequences | List\[Float\] | \[0, $+\infty$\) | `0.0f` | `length_penalty` |
| `earlyStopping` | see description below | List\[Int\] | \($-\infty$, $+\infty$\) | `0` | `early_stopping` |
| `beamWidthArray` | see description below | List\[List\[Int\]\] | \[0, 1024\] | `` | no |

* Beam-search algorithm: [beam search](https://en.wikipedia.org/wiki/Beam_search).
* Parameter `diversity_penalty` in HF is only used for `diverse beam-search decoding` (also called `Group-Beam-Search`), which is not supported by TRT-LLM yet.
* If `earlyStopping = 1`, decoding stops once `beamWidth` finished sentences have been generated.
* If `earlyStopping = 0`, decoding keeps going until no better sentences (with a better score) can be generated.
* If `earlyStopping` is set to other values, decoding stops depending only on `lengthPenalty`.
* `beamWidthArray` is a list of beam widths, one per step. Using `beamWidthArray = [20,40,80]` as an example,
  the beam width will be 20 for the first step, 40 for the second step, and 80 for all later steps (see the sketch below).
* The `beamWidth` parameter is a scalar value. It means that in this release of
  TensorRT-LLM, it is not possible to specify a different width for each input
  sequence. This limitation is likely to be removed in a future release.
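To make the `beamWidthArray` schedule concrete, here is a small self-contained illustration of how a per-step beam width list is interpreted (plain Python; it only restates the rule given in the bullet above, not an actual TRT-LLM API):

```python
# beamWidthArray = [20, 40, 80] means step 0 uses 20 beams, step 1 uses 40,
# and every step from index 2 onward keeps using the last entry, 80.
beam_width_array = [20, 40, 80]

def beam_width_at(step: int) -> int:
    return beam_width_array[min(step, len(beam_width_array) - 1)]

assert [beam_width_at(s) for s in range(5)] == [20, 40, 80, 80, 80]
```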
## The Session

*The runtime session is deprecated in favor of the {ref}`executor`.
It will be removed in a future release of TensorRT-LLM.*

An example of how to use the `GptSession` to run a GPT-like auto-regressive model can be found in
[`cpp/tests/runtime/gptSessionTest.cpp`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tests/runtime/gptSessionTest.cpp).

### Internal Components

The `GptSession` class encapsulates two main components. The
[`TllmRuntime`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/runtime/tllmRuntime.h) is in charge of the
execution of the TensorRT engine. The
[`GptDecoder`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/gptDecoder.h)
generates the tokens from the logits. The `TllmRuntime` class is
an internal component and you are not expected to use that class directly.
The `GptDecoder` can be used directly to implement a custom generation loop
and for use cases that cannot be satisfied by the implementation in
`GptSession`.

## In-flight Batching Support

In-flight batching is supported using separate decoders per
request. The biggest difference compared to using a single decoder is in how
the token generation from logits is managed. A batch is split into `batchSize`
individual requests and kernels are issued using separate CUDA streams.
This behavior may be revisited in a future release to maintain the structure
of the batch and improve efficiency.

## Known Issues and Future Changes

* In the current release of TensorRT-LLM, the C++ and Python runtimes are two
separate software components and the C++ runtime is being more actively
developed (with features like in-flight batching). An objective, for a
future release, could be to rebuild the Python runtime on top of the C++
one.
The [`TllmRuntime`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/runtime/tllmRuntime.h) is in charge of the execution of the TensorRT engine.
The `TllmRuntime` class is an internal component and you are not expected to use that class directly.
The [`GptDecoder`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/gptDecoder.h) generates tokens from the logits.
The `GptDecoder` can be used directly to implement a custom generation loop and for use cases that cannot be satisfied by the TRT-LLM implementation.
@ -33,8 +33,6 @@ parameters: {

If you are writing your own application using the Executor API, you can enable kv cache reuse by including `enableBlockReuse=true` when you create the `KvCacheConfig` object. Note that this is the default; if you wish to disable kv cache reuse, pass `enableBlockReuse=false` instead. A Python-level sketch follows.
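A minimal sketch of the same settings through the Python LLM API, assuming the `KvCacheConfig` fields `enable_block_reuse` and `host_cache_size` mirror the Executor API's `enableBlockReuse`/`hostCacheSize` (hedged; model name and sizes are illustrative, not part of this diff):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Block reuse is on by default; set it explicitly here for clarity, and
# reserve a host buffer for offloading (see the offloading section below).
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,
    host_cache_size=45_000_000_000,  # bytes
)

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          kv_cache_config=kv_cache_config)
```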
GptSession is scheduled to be obsoleted and does not support kv cache reuse.

### Enable kv cache reuse for p-tuning

When using p-tuning, different requests may use the same fake input ids (i.e. prompt ids whose values are larger than the vocabulary size). That may lead to incorrect kv cache reuse, since TRT-LLM cannot distinguish these requests by input ids alone. To enable kv cache reuse for p-tuning correctly, users should provide an extra id (uint64) for each input id. Extra ids for normal input ids (i.e. text token ids) should always be 0, while fake input ids should have extra ids larger than 0. Requests using the same prompt embeddings should use the same extra ids, while requests using different prompt embeddings should use different extra ids. A sketch of this pairing rule follows.
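To make the pairing rule concrete, here is a small self-contained illustration (plain Python with a hypothetical `VOCAB_SIZE` and helper; not a TRT-LLM API):

```python
VOCAB_SIZE = 32000  # hypothetical vocabulary size

def build_extra_ids(input_ids: list[int], prompt_table_id: int) -> list[int]:
    """Pair each input id with an extra id: 0 for real text tokens,
    a nonzero per-prompt-embedding id for fake (p-tuning) ids."""
    return [0 if tok < VOCAB_SIZE else prompt_table_id for tok in input_ids]

# Two requests sharing one prompt embedding table reuse extra id 1;
# a request with a different table must pick a different nonzero id.
req_a = build_extra_ids([32000, 32001, 17, 42], prompt_table_id=1)
req_b = build_extra_ids([32000, 32001, 99, 7], prompt_table_id=1)
req_c = build_extra_ids([32000, 32001, 17, 42], prompt_table_id=2)
assert req_a == [1, 1, 0, 0] and req_c == [2, 2, 0, 0]
```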
@ -94,5 +92,3 @@ parameters: {
```

If you are writing your own application using the Executor API, you can enable offloading to host by including `hostCacheSize=45000000000` when you create the `KvCacheConfig` object. This will create a 45 GB offloading buffer in host memory.

GptSession is scheduled to be obsoleted and does not support kv cache block offloading.
@ -38,18 +38,6 @@ python3 examples/summarize.py \

```

We can also benchmark the efficiency of Weight Streaming. Here is an example:
```bash
python3 benchmarks/python/benchmark.py \
    --engine_dir /tmp/llama_7b/trt_engines/fp16/1-gpu/ \
    --batch_size "1;32" \
    --input_output_len "256,32" \
    --gpu_weights_percent "0.0;0.3;0.6;1.0" \
    --dtype float16 \
    --csv \
    --log_level verbose
```

### API Changes
@ -168,16 +168,16 @@ As a result, even if TensorRT has a powerful pattern-matching algorithm and
supports a lot of possible fusions, there is always the risk that it cannot
identify uncommon and/or very advanced patterns. To overcome that inevitable
limitation, TensorRT offers a powerful mechanism known as
[plugins](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Plugin/pyPlugin.html).
[plugins](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Plugin/pyPlugin.html).

The plugins are nodes inserted in the network graph definition that map to user-defined
GPU kernels. TensorRT-LLM uses a number of such plugins. They can be found in
the [`cpp/tensorrt_llm/plugins`](source:/cpp/tensorrt_llm/plugins) directory.

Plugins are written in C++ and follow a well-defined interface described in the
[Extending TensorRT with Custom Layers](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending)
[Extending TensorRT with Custom Layers](https://docs.nvidia.com/deeplearning/tensorrt/latest/inference-library/extending-custom-layers.html)
section of the TensorRT
[Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html).
[Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/latest/index.html).
When executed within a TensorRT engine, plugins trigger the execution of
their encapsulated GPU kernels. A fairly simple example of plugins is the
[`QuantizeTensorPlugin`](source:/cpp/tensorrt_llm/plugins/quantizeTensorPlugin) that
@ -135,7 +135,6 @@ YOUR_DATA_PATH=<your dataset file following the format>

cat >./extra-llm-api-config.yml<<EOF
pytorch_backend_config:
  enable_overlap_scheduler: true
  use_cuda_graph: true
  moe_backend: TRTLLM
speculative_config:
@ -218,7 +217,6 @@ pytorch_backend_config:
  - 256
  - 384
  print_iter_log: true
  enable_overlap_scheduler: true
  enable_attention_dp: true
EOF

@ -260,7 +258,6 @@ YOUR_DATA_PATH=<your dataset file following the format>

cat >./extra-llm-api-config.yml<<EOF
pytorch_backend_config:
  enable_overlap_scheduler: true
  use_cuda_graph: true
speculative_config:
  decoding_type: MTP
@ -314,7 +311,6 @@ pytorch_backend_config:
  use_cuda_graph: true
  cuda_graph_batch_sizes:
  - 128
  enable_overlap_scheduler: true
  enable_attention_dp: true
EOF

@ -329,7 +325,7 @@ trtllm-bench -m deepseek-ai/DeepSeek-R1 \
    --dataset $YOUR_DATA_PATH \
    --backend pytorch \
    --max_batch_size 128 \
    --max_num_tokens 1127 \
    --max_num_tokens 1151 \
    --num_requests 5120 \
    --concurrency 1024 \
    --kv_cache_free_gpu_mem_fraction 0.8 \
@ -343,13 +339,13 @@ The perf might be different from different datasets and machines
===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Request Throughput (req/sec):                     5.1532
Total Output Throughput (tokens/sec):             10553.8445
Per User Output Throughput (tokens/sec/user):     10.4199
Per GPU Output Throughput (tokens/sec/gpu):       1319.2306
Total Token Throughput (tokens/sec):              15707.0888
Total Latency (ms):                               993548.8470
Average request latency (ms):                     197768.0434
Request Throughput (req/sec):                     5.6100
Total Output Throughput (tokens/sec):             11489.2671
Per User Output Throughput (tokens/sec/user):     11.3476
Per GPU Output Throughput (tokens/sec/gpu):       1436.1584
Total Token Throughput (tokens/sec):              17233.9007
Total Latency (ms):                               912656.9938
Average request latency (ms):                     181540.5739
```

## Exploring more ISL/OSL combinations
@ -0,0 +1,266 @@
# Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs

by NVIDIA TensorRT-LLM team

## Table of Contents

- [Background](#background)
- [Implementation Configuration](#implementation-configuration)
  - [Workload Profile](#workload-profile)
  - [Model Architecture](#model-architecture)
  - [Precision Strategy](#precision-strategy)
  - [Parallelism Strategy](#parallelism-strategy)
  - [Everything in One Diagram](#everything-in-one-diagram)
- [Key Optimizations](#key-optimizations)
  - [System Level optimizations](#system-level-optimizations)
    - [CUDA Graph & Programmatic Dependent Launch](#cuda-graph--programmatic-dependent-launch)
    - [MTP](#mtp)
      - [Autoregressive MTP Layers](#autoregressive-mtp-layers)
      - [Relax Acceptance Verification](#relax-acceptance-verification)
    - [Multi-streams](#multi-streams)
    - [Sparse Experts as GEMMs](#sparse-experts-as-gemms-only-works-when-moe_backendcutlass)
    - [Re-balanced the sparse experts](#re-balanced-the-sparse-experts)
      - [Mixed ETP](#mixed-etp)
      - [Smart Router](#smart-router)
  - [Kernel Level optimizations](#kernel-level-optimizations)
    - [Attention Kernel](#attention-kernel)
    - [Grouped GEMM](#grouped-gemm)
      - [CUTLASS Backend](#cutlass-backend-default-backend)
      - [TRTLLM Backend](#trtllm-backend)
    - [Communication Kernel](#communication-kernel)
    - [Dense GEMM optimization](#dense-gemm-optimization)
      - [Fuse_A_GEMM](#fuse_a_gemm)
      - [RouterGEMM](#routergemm)
    - [Kernel fusion](#kernel-fusion)
- [How to reproduce](#how-to-reproduce)
- [Future Works](#future-works)
- [Acknowledgment](#acknowledgment)

## Background
Recent advancements in Large Language Reasoning Models have demonstrated remarkable success, while creating new deployment challenges. A critical challenge emerges from extended Output Sequence Lengths (OSL) due to complex "thinking and reasoning" processes. Longer OSL demands stricter Token-to-Token Latency (TTL) requirements, often forcing concurrency limitations. The most extreme case, single concurrency (the min-latency scenario), becomes particularly challenging for real-time applications.

This article explores how TensorRT-LLM achieves record-breaking performance for [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) in min-latency scenarios on NVIDIA's 8×B200 GPU configuration, progressing from 67 tokens per second (TPS) to 253 TPS before GTC 2025 (a **3.7x** speed-up), and on to our current number of 368 TPS (a **5.5x** speed-up).

## Implementation Configuration

### Workload Profile
Input Sequence Length (ISL): 1k tokens

Output Sequence Length (OSL): 2k tokens

### Model Architecture
The DeepSeek-R1 base model contains 3 initial dense layers and 58 MoE layers; there is also 1 Multi-Token Prediction (MTP) layer (MoE-architecture equivalent) for speculative decoding. Our optimized configuration extends the MTP to 3 layers, run in an autoregressive style, for peak performance exploration.

<img src="../media/tech_blog1_model_overview.png?raw=true" alt="tech_blog1_model_overview" width="500" height="auto">

### Precision Strategy
We have explored a mixed precision recipe, which provides a better tradeoff between accuracy and performance.

| Component | Precision |
|:-------------------------------------:|:---------:|
| 64x Attention Modules | bf16* |
| 3x Dense FFN Layers | nvfp4** |
| 58x MoE FFN Layers | nvfp4 |
| 3x MTP Layers | bf16 |
| RouterGEMM*** | bf16 |

*TensorRT-LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla), but in this latency scenario low-precision attention computation does not help performance, so we use bf16 precision for the Attention Modules.

** The nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer).

*** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability.

### Parallelism Strategy
We have also explored and introduced a mixed parallelism strategy on 8×B200 GPUs. Specifically, the best strategy for this latency scenario is 'TP8EP2', where the components map as follows:

| Component | Parallelism Patterns |
|:---------------------:|:--------------------------------------------------------:|
| Attention Modules | Tensor Parallelism 8 (TP8) |
| MoE Sparse Experts | Mixed TP4 with Expert Parallelism 2 (EP2) |
| MoE Shared Experts | TP8 |
| Fuse_A GEMM | Data Parallelism 8 (DP8) |
| RouterGEMM | DP8 |

### Everything in One Diagram
Now let's put everything into one diagram, which represents a MoE layer from a decoding iteration.

<img src="../media/tech_blog1_model_details.png?raw=true" alt="tech_blog1_model_details" width="1600" height="auto">

The modules in the diagram are:
- Input Module: a BF16 tensor with shape [m, 7168], where m is the number of tokens (for instance, m = 4 when using three MTP layers), and 7168 is the model's hidden size.

- Module1 (Fuse_A_GEMM): concatenates the weights for [WDQ, WDKV, and WKR](https://arxiv.org/pdf/2412.19437) to reduce kernel launch overhead.

- Module2 (2× RMSNorm): performs normalization for the Q/K tensors. These can be either overlapped on multiple streams or fused into a single grouped RMSNorm.

- Module3 (UQ_QR_GEMM): concatenates WUQ and WQR weights to reduce kernel launch overhead.

- Module4 (UK_BGEMM): uses WUK in a batched GEMM. We avoid absorbing Modules 3 and 4 to prevent weight-size inflation and extra loading costs.

- Module5 (Concat KVCache & applyRope): merges the K/V cache and applies RoPE (Rotary Position Embedding).

- Module6 (genAttention): performs MLA during generation, acting like an MQA with num_q_heads = 128 / TP8 = 16.

- Module7 (UV_GEMM): executes a batched GEMM with WUV weights.

- Module8 (WO_GEMM): runs a dense GEMM using WO weights. We do not absorb Modules 7 and 8 to avoid increased weight-loading overhead.

- Module9 (Fused Kernels): incorporates oneshotAllReduce, Add_RMSNorm, and DynamicQuant (BF16 to NVFP4) in a single kernel.

- Module10 (routerGEMM & topK): handles the router GEMM and topK selection.

- Module11 (Shared Expert): overlaps partially with Module10 and Module12.

- Module12 (Sparse Experts): implements the expert layers via grouped GEMM.

- Module13 (Final Fused Kernels): performs the localReduction, oneshotAllReduce, and Add_RMSNorm operations together.
## Key Optimizations
| Feature | TPS/User | Code Links / Notes |
|:----------------------------------------------------------|:--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Baseline: CUDA Graph + EP8TP8 | 67 | [modeling_deepseekv3.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py) |
| Multi Stream to overlap shared expert with sparse experts | 73 | [modeling_deepseekv3.py#L506](https://github.com/NVIDIA/TensorRT-LLM/blob/14bfb5e0d6e81aec3306a1324cf074566646f886/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L506) |
| Optimize MLA Kernel | 80 | [PR #3763](https://github.com/NVIDIA/TensorRT-LLM/pull/3763) |
| Optimize TopK Kernels | 84 | • [RoutingKernel.cu](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/trtllmGenSrc/RoutingKernel.cu)<br/>• [noAuxTcKernels.cu](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu) |
| Optimize Fuse_A_GEMM | 89 | [attention.py#L345](https://github.com/NVIDIA/TensorRT-LLM/blob/d6b741ddfe7f8a80718c10d49773c42abc0a254f/tensorrt_llm/_torch/modules/attention.py#L345) |
| MTP3_Vanilla | 154 | evolved into MTP3_Autoregressive |
| Evolve to MTP3_Autoregressive + Optimize Router GEMM | 164 | [modeling_deepseekv3.py#L304](https://github.com/NVIDIA/TensorRT-LLM/blob/d6b741ddfe7f8a80718c10d49773c42abc0a254f/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L304) |
| Fuse oneshotAR + RMSNorm | 168 | [allReduceFusionKernels.cu#L440](https://github.com/NVIDIA/TensorRT-LLM/blob/d6b741ddfe7f8a80718c10d49773c42abc0a254f/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu#L440) |
| Enable PDL | 173 | Set environment variable: `export TRTLLM_ENABLE_PDL=1` |
| Multi-stream to overlap two RMS_norms | 180 | [attention.py#L546](https://github.com/NVIDIA/TensorRT-LLM/blob/d6b741ddfe7f8a80718c10d49773c42abc0a254f/tensorrt_llm/_torch/modules/attention.py#L546) |
| MTP3_Autoregressive | 204 | [modeling_deepseekv3.py#L823](https://github.com/NVIDIA/TensorRT-LLM/blob/d6b741ddfe7f8a80718c10d49773c42abc0a254f/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L823) |
| Finetune clock/power | 211 | `sudo nvidia-smi -pm 0; sudo nvidia-smi -pm 1; sudo nvidia-smi boost-slider --vboost 4` |
| Optimize CUTLASS Grouped GEMM Kernels | 236 | Not open-source yet due to a dependency on an internal base environment; we plan to decouple it so that it can be open-sourced in the future. |
| Optimize CUTLASS Flow: Sparse Experts as GEMMs | 249 | Not open-source yet due to a dependency on an internal base environment; we plan to decouple it so that it can be open-sourced in the future. |
| Introduce EP4TP2 for better workload balance | 253 | Use `--tp 8 --ep 4` when benchmarking |
| Introduce moe_backend=TRTLLM, EP2TP4 for better balance | 299 | [PR #4280](https://github.com/NVIDIA/TensorRT-LLM/pull/4280) |
| Optimize Fuse_A_GEMM and Router_GEMM | 340 | WIP: [PR #4115](https://github.com/NVIDIA/TensorRT-LLM/pull/4115) |
| Relax Acceptance | **368** | [deepseek_v3#multi-token-prediction-mtp](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#multi-token-prediction-mtp) |

### System Level optimizations
#### CUDA Graph & Programmatic Dependent Launch
[CUDA Graph](https://developer.nvidia.com/blog/cuda-graphs/) is necessary to overcome the CPU overhead for small workloads, while [Programmatic Dependent Launch](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=Programmatic%2520Dependent%2520Launch#programmatic-dependent-launch-and-synchronization) can be used to further reduce the kernel launch latency. A minimal capture-and-replay sketch follows.
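To illustrate the capture-once, replay-many pattern that removes per-kernel CPU launch cost, here is a minimal PyTorch sketch (a toy `torch.nn.Linear` stands in for the decode step; this is illustrative only, not the TRT-LLM integration):

```python
import torch

assert torch.cuda.is_available()
model = torch.nn.Linear(7168, 7168, device="cuda", dtype=torch.bfloat16)
static_in = torch.zeros(4, 7168, device="cuda", dtype=torch.bfloat16)

# Warm up on a side stream, then record the whole forward pass once.
g = torch.cuda.CUDAGraph()
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        model(static_in)
torch.cuda.current_stream().wait_stream(s)

with torch.cuda.graph(g):
    static_out = model(static_in)

# Per decode step: refill the static input buffer, then replay the
# recorded graph with a single launch instead of many kernel launches.
static_in.copy_(torch.randn_like(static_in))
g.replay()
print(static_out.shape)
```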
#### MTP
There are two optimizations based on MTP:
##### Autoregressive MTP Layers

| Version | Acceptance Rate | TPS/User | TPS/User Speedup |
|:-----------:|:---------------:|:--------:|:----------------:|
| Without MTP | 1.00 | 111 | 1.00 |
| MTP 1 | 1.92 | 198 | 1.78 |
| MTP 2 | 2.58 | 250 | 2.25 |
| MTP 3 | 2.82 | 253 | 2.28 |
| MTP 4 | 2.99 | 245 | 2.21 |
| MTP 5 | 3.01 | 239 | 2.15 |

Based on our exploration, the 3x MTP layer configuration demonstrates optimal performance. The implied per-iteration overhead can be read off the table, as the short calculation below shows.
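As a sanity check on these numbers, the measured speedup is roughly the acceptance rate discounted by the extra per-iteration cost of running the MTP layers; a small back-of-the-envelope script using only the values from the table above:

```python
# TPS speedup = acceptance_rate / relative_iteration_cost, so the implied
# per-iteration cost of the draft layers can be recovered from the table.
rows = {  # MTP depth: (acceptance rate, measured TPS/User speedup)
    1: (1.92, 1.78),
    2: (2.58, 2.25),
    3: (2.82, 2.28),
    4: (2.99, 2.21),
    5: (3.01, 2.15),
}
for depth, (acceptance, speedup) in rows.items():
    rel_cost = acceptance / speedup
    print(f"MTP {depth}: each iteration costs ~{rel_cost:.2f}x a plain step")
# Beyond 3 MTP layers the acceptance gain no longer pays for the extra
# per-iteration cost, which is why 3 layers is the sweet spot.
```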
##### Relax Acceptance Verification
For a reasoning model such as DeepSeek-R1, the generation may consist of two phases: a thinking phase and the actual output. During the thinking phase, when relaxed acceptance is enabled, a draft token can be accepted when it is in a candidate set. This candidate set is generated based on the logits topN and a probability threshold (see the sketch below).
- topN: the topN tokens are sampled from the logits.
- Probability threshold: among the topN candidates, only tokens with a probability greater than the Top1 probability minus delta remain in the candidate set.

During the non-thinking phase, we still use strict acceptance.
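A minimal sketch of this candidate-set construction (plain PyTorch, illustrative only; the parameter names follow the `relaxed_topk`/`relaxed_delta` knobs mentioned in the reproduction section):

```python
import torch

def relaxed_candidates(logits: torch.Tensor, topk: int = 10, delta: float = 0.6) -> torch.Tensor:
    """Return the token ids a draft token may match during the thinking phase."""
    probs = torch.softmax(logits, dim=-1)
    top_probs, top_ids = torch.topk(probs, topk)
    keep = top_probs > (top_probs[0] - delta)  # within delta of the Top1 prob
    return top_ids[keep]

logits = torch.randn(32000)
cands = relaxed_candidates(logits)
draft_token = 123
accepted = draft_token in cands.tolist()  # strict acceptance compares against Top1 only
```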
| Version | Acceptance Rate | TPS/User Speedup |
|:------------------:|:--------------:|:----------------:|
| MTP3_top1, d0.0 | 2.82 | 1.00 |
| MTP3_top10, d0.5 | 3.06 | 1.08 |
| MTP3_top10, d0.6 | 3.10 | 1.09 |
| MTP3_top15, d0.5 | 3.07 | 1.08 |

This relaxed way of verification improves the acceptance rate and brings a positive speedup with limited influence on accuracy.

| Dataset | Test Size | w/o Relaxed accuracy | w/ Relaxed accuracy |
|:-------------------------:|:---------:|:----------:|:----------:|
| MMLU-Pro | 12,032 | 84.0% | 81.2% |
| Humanity's Last Exam | 2,684 | 9.0% | 9.0% |
| GPQA Diamond | 198 | 71.0% | 69.2% |
| MATH-500 | 500 | 96.0% | 96.2% |
| AIME 2024 | 30 | 68.0% | 74.0% |
| SciCode | 338 | 36.0% | 39.0% |
| LiveCodeBench | 315 | 62.0% | 66.0% |

For more information, please visit [multi-token-prediction-mtp](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#multi-token-prediction-mtp).
#### Multi-streams
We have introduced multi-stream based optimizations to hide some kernels' overhead, for example (a stream-overlap sketch follows the list):
- Overlap the shared experts with the sparse experts
- Overlap the Concat_KVCache kernel with GEMM
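A minimal PyTorch sketch of the first pattern, assuming hypothetical `shared_expert`/`sparse_experts` callables (illustrative only; memory-lifetime caveats omitted, and the production code lives in `modeling_deepseekv3.py`):

```python
import torch

side = torch.cuda.Stream()

def moe_forward(x, shared_expert, sparse_experts):
    current = torch.cuda.current_stream()
    side.wait_stream(current)              # side stream sees x as ready
    with torch.cuda.stream(side):
        shared_out = shared_expert(x)      # runs concurrently...
    sparse_out = sparse_experts(x)         # ...with the sparse experts
    current.wait_stream(side)              # join before combining results
    return sparse_out + shared_out
```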
#### Sparse Experts as GEMMs (only works when moe_backend=CUTLASS)

<img src="../media/tech_blog1_sparse_exp_as_a_gemm.png?raw=true" alt="tech_blog1_sparse_exp_as_a_gemm" width="800" height="auto">

The existing CUTLASS-based sparse-experts flow (illustrated in the figure) dispatches input tokens to their designated experts, then applies an indexed local reduction on each expert's outputs before a global allreduce. Both the dispatching and the indexed local reduction incur high overhead in low-latency scenarios. To address this, we propose treating "sparse experts as GEMMs" by sending all tokens to each activated expert and masking out unneeded outputs before the local reduction. Because the grouped GEMMs are memory-bound, the extra computation from redundant tokens has minimal impact, effectively eliminating the costly dispatch and reduction overhead. A masking sketch follows.
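A hedged PyTorch sketch of the masking idea (illustrative shapes and names; not the CUTLASS kernel):

```python
import torch

def experts_as_gemms(x, expert_weights, routing_mask):
    """x: [m, hidden]; expert_weights: [n_exp, hidden, hidden];
    routing_mask: [n_exp, m] booleans produced by the router."""
    # Every token goes through every activated expert (redundant compute,
    # but the grouped GEMMs are memory-bound, so it is nearly free at small m)...
    outs = torch.einsum("mh,ehk->emk", x, expert_weights)
    # ...and unwanted rows are masked out before the local reduction,
    # replacing the indexed scatter/gather dispatch path.
    outs = outs * routing_mask.unsqueeze(-1)
    return outs.sum(dim=0)  # local reduction; the global allreduce follows

m, hidden, n_exp = 4, 7168, 8
y = experts_as_gemms(torch.randn(m, hidden),
                     torch.randn(n_exp, hidden, hidden),
                     torch.rand(n_exp, m) > 0.75)
```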
#### Re-balancing the sparse experts

For sparse experts, two parallelization strategies are commonly used: Expert Parallel (EP) and Tensor Parallel (TP). Expert Parallel maps each expert to a distinct GPU, achieving high memory and computational efficiency. However, token placement is data-dependent, which distributes workloads unevenly across GPUs and exposes overhead in the AllReduce step after the MoE module. Tensor Parallel shards each expert evenly across GPUs, creating a balanced workload but sacrificing math/memory efficiency.

##### Mixed ETP

A combined EP/TP approach can mitigate both challenges. In practice, our experiments show that a TP4EP2 configuration offers the best performance.

##### Smart Router

Alternatively, by storing all expert weights on a cluster of four GPUs and replicating them to another four-GPU cluster, a smart router can dynamically dispatch tokens across the two clusters. This design keeps the workload balanced without significantly impacting local memory and computation efficiency.

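A minimal sketch of the smart-router idea, under our own simplifying assumptions (tokens are dispatched one at a time, and load is measured as the queued token count per replica):

```python
def smart_route(token_expert_ids, num_replicas=2):
    """Dispatch each token's expert work to the currently lighter replica."""
    load = [0] * num_replicas
    assignments = []
    for expert in token_expert_ids:
        replica = min(range(num_replicas), key=load.__getitem__)
        load[replica] += 1
        assignments.append((expert, replica))
    return assignments, load

# Example: 6 tokens with data-dependent expert choices stay balanced 3/3.
print(smart_route([3, 7, 7, 1, 0, 3]))
```
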
### Kernel-level optimizations

#### Attention Kernel

We have developed a customized MLA attention kernel to better utilize GPU resources in latency scenarios.

#### Grouped GEMM

##### CUTLASS Backend (default backend)

Our default MoE backend is based on CUTLASS. It is flexible and robust, but it does not always deliver the best performance.

##### TRTLLM Backend

The other MoE backend is TRTLLM, which provides better performance. We are working to make it more flexible and robust, and it will eventually become the default backend for Grouped GEMM computation in latency scenarios.

#### Communication Kernel

For small message sizes, regular NCCL latency-bound AllReduce kernels are inefficient, so we have developed a customized one-shot AllReduce kernel. It leverages the powerful NVSwitch hardware capability by acting like an initial broadcast followed by a local reduction, delivering better performance in min-latency scenarios.

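The pattern can be simulated in a few lines of NumPy (a conceptual model only; the real kernel operates on NVSwitch-connected GPU buffers):

```python
import numpy as np

def oneshot_allreduce(rank_data):
    """rank_data: one array per rank. Each rank 'broadcasts' its buffer to
    every peer, then performs a single local reduction - one round trip,
    instead of NCCL's multi-step ring or tree exchange."""
    world = len(rank_data)
    staging = np.stack(rank_data)                 # all ranks' broadcast data
    return [staging.sum(axis=0) for _ in range(world)]  # local reductions

data = [np.full(4, r, dtype=np.float32) for r in range(4)]
print(oneshot_allreduce(data)[0])   # [6. 6. 6. 6.] on every rank
```
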
#### Dense GEMM optimization

We focus on optimizing two kinds of dense GEMMs: Fuse_A_GEMM and RouterGEMM, because they dominate the execution time, suffer from low memory efficiency, and cannot be easily sharded (they are DP-based).

##### Fuse_A_GEMM

We developed a custom Fuse_A_GEMM that prefetches the majority of its weights into shared memory (enabled by PDL and overlapped with the one-shot AllReduce), significantly enhancing performance. The kernel shows substantial improvements over the default GEMM implementation when num_tokens < 16.

<img src="../media/tech_blog1_fuse_a_gemm.png?raw=true" alt="tech_blog1_fuse_a_gemm" width="500" height="auto">

##### RouterGEMM

By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when [num_tokens <= 30](https://github.com/NVIDIA/TensorRT-LLM/pull/4115/files#diff-006ae982200a5ef2b27f4aedb526025e64406d3c2fadde329ea745793fac04edR303:~:text=and%20hidden_states.-,size,-(0))

<img src="../media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">

#### Kernel fusion

Kernel fusion is necessary in min-latency scenarios to reduce extra global memory write/read costs. We currently support the following fusion patterns (a sketch of the first pattern follows this list):

- Fuse two overlapped RMS_Norms into one GroupedRMSNorm
- Fuse (LocalReduction) + AR + RMS_Norm + (Dynamic_Quant_bf16tonvfp4) into one kernel
- Fuse Grouped GEMM_FC1 + dot activation (when moe_backend=TRTLLM) into one kernel

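As a sketch of the first pattern (our own toy model, not the fused kernel itself), two independent RMSNorms over same-shaped tensors can be folded into one batched launch:

```python
import torch

def rms_norm(x, eps=1e-6):
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

def grouped_rms_norm(a, b):
    # One launch over a stacked tensor instead of two separate kernels,
    # saving a global-memory round trip in the min-latency regime.
    normed = rms_norm(torch.stack([a, b]))
    return normed[0], normed[1]

a, b = torch.randn(8, 128), torch.randn(8, 128)
out_a, out_b = grouped_rms_norm(a, b)
```
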
## How to reproduce

https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md#b200-min-latency

Note that Relaxed Acceptance is specific to the DeepSeek-R1 model. To enable it, you need to set `add_generation_prompt = True` when preparing the benchmark dataset; the code looks like this:

```python
# Apply the chat template with a generation prompt before tokenizing.
input_ids = tokenizer.encode(tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True), add_special_tokens=False)
```

You also need to set `use_relaxed_acceptance_for_thinking: true`, `relaxed_topk: 10`, and `relaxed_delta: 0.6` in `speculative_config`.

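For reference, a sketch of wiring these values up through the LLM API is shown below. Treat the import path and field names as assumptions based on the prose above; consult the linked DeepSeek-V3 example for the authoritative configuration.

```python
# Hedged sketch: field names follow the settings described above; the
# exact API surface may differ between releases.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import MTPDecodingConfig  # assumed import path

llm = LLM(
    model="deepseek-ai/DeepSeek-R1",
    speculative_config=MTPDecodingConfig(
        num_nextn_predict_layers=3,               # MTP3
        use_relaxed_acceptance_for_thinking=True,
        relaxed_topk=10,
        relaxed_delta=0.6,
    ),
)
```
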
## Future Works

- More Fusions
- More Overlap
- More Attention Kernel optimizations
- More Exploration of MTP

## Acknowledgment

Pushing the performance boundaries of DeepSeek R1 for latency-sensitive applications has been a remarkable engineering journey. The optimizations detailed in this post represent an exceptional cross-functional collaboration across the entire AI technology stack, spanning kernel-level optimizations, runtime enhancements, model quantization techniques, algorithmic improvements, and systematic performance analysis and tuning. While we can't individually acknowledge every contributor, we're proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in TensorRT-LLM performance engineering.

Through this collaborative endeavor, we've developed valuable insights into maximizing GPU utilization for large language model inference. We hope that the techniques and best practices shared in this blog will empower the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

@ -78,6 +78,7 @@ First, create a configuration file:

cat >./extra-llm-api-config.yml<<EOF
kv_cache_config:
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.6
EOF

Then, start the server with the configuration file:

@ -0,0 +1,10 @@
Genai Perf Client For Multimodal
================================

Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/serve/genai_perf_client_for_multimodal.sh.

.. literalinclude:: ../../../examples/serve/genai_perf_client_for_multimodal.sh
   :language: bash
   :linenos:
@ -14,19 +14,19 @@ The LLM API can be used for both offline or online usage. See more examples of t
:maxdepth: 1
:caption: LLM API Examples

llm_inference_async
llm_inference_kv_events
llm_inference_customize
llm_lookahead_decoding
llm_medusa_decoding
llm_guided_decoding
llm_logits_processor
llm_quantization
llm_inference
llm_multilora
llm_inference_async_streaming
llm_inference_distributed
llm_eagle_decoding
llm_inference_async
llm_inference_distributed
llm_logits_processor
llm_inference_kv_events
llm_lookahead_decoding
llm_quantization
llm_inference_async_streaming
llm_guided_decoding
llm_inference
llm_inference_customize
llm_auto_parallel
llm_mgmn_llm_distributed
llm_mgmn_trtllm_bench

@ -5,19 +5,19 @@ LLM Examples
:maxdepth: 2
:caption: Scripts

llm_inference_async
llm_inference_kv_events
llm_inference_customize
llm_lookahead_decoding
llm_medusa_decoding
llm_guided_decoding
llm_logits_processor
llm_quantization
llm_inference
llm_multilora
llm_inference_async_streaming
llm_inference_distributed
llm_eagle_decoding
llm_inference_async
llm_inference_distributed
llm_logits_processor
llm_inference_kv_events
llm_lookahead_decoding
llm_quantization
llm_inference_async_streaming
llm_guided_decoding
llm_inference
llm_inference_customize
llm_auto_parallel
llm_mgmn_llm_distributed
llm_mgmn_trtllm_bench

@ -10,6 +10,7 @@ Online Serving Examples
curl_completion_client
deepseek_r1_reasoning_parser
genai_perf_client
genai_perf_client_for_multimodal
openai_chat_client
openai_chat_client_for_multimodal
openai_completion_client

@ -143,6 +143,7 @@ Welcome to TensorRT-LLM's Documentation!
blogs/Falcon180B-H200.md
blogs/quantization-in-TRT-LLM.md
blogs/XQA-kernel.md
blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md


Indices and tables

@ -2,7 +2,8 @@

# Building from Source Code on Linux

This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source code is necessary if you want the best performance or debugging capabilities, or if the [GNU C++11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) is required.
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.


## Prerequisites

@ -149,7 +150,7 @@ Refer to the {ref}`support-matrix-hardware` section for a list of architectures.

#### Building the Python Bindings for the C++ Runtime

The C++ Runtime, in particular, `GptSession` can be exposed to Python via bindings. This feature can be turned on through the default build options.
The C++ Runtime can be exposed to Python via bindings. This feature can be turned on through the default build options.

```bash
python3 ./scripts/build_wheel.py
@ -169,8 +170,7 @@ The `build_wheel.py` script will also compile the library containing the C++ run
python3 ./scripts/build_wheel.py --cuda_architectures "80-real;86-real" --cpp_only --clean
```

This is particularly useful to avoid linking problems which may be introduced by particular versions of `torch` related to the [dual ABI support of GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The option `--clean` will remove the build directory before building. The default build directory is `cpp/build`, which may be overridden using the option
`--build_dir`. Run `build_wheel.py --help` for an overview of all supported options.
This is particularly useful for avoiding linking issues that may arise with older versions of `torch` (prior to 2.7.0) due to the [Dual ABI support in GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The `--clean` option removes the build directory before starting a new build. By default, TensorRT-LLM uses `cpp/build` as the build directory, but you can specify a different location with the `--build_dir` option. For a complete list of available build options, run `python3 ./scripts/build_wheel.py --help`.

The shared library can be found in the following location:

@ -217,6 +217,4 @@ TRTLLM_PRECOMPILED_LOCATION=https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.

#### Known Limitations

Currently, our released TensorRT-LLM wheel packages are linked against public PyTorch hosted on PyPI, which disables C++11 ABI support. However, the Docker image built previously is based on an NGC container where PyTorch has C++11 ABI enabled; see [NGC PyTorch container page](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch). Therefore, we recommend performing a full build inside this container.

When using `TRTLLM_PRECOMPILED_LOCATION`, ensure that your wheel is compiled based on the same version of C++ code as your current directory; any discrepancies may lead to compatibility issues.

@ -5,12 +5,12 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).

```bash
pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```

If using the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) image, the prerequisite step for installing CUDA-enabled PyTorch package is not required.
If using the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) image, the prerequisite steps for installing CUDA-enabled PyTorch package and `libopenmpi-dev` are not required.

2. Sanity check the installation by running the following in Python (tested on Python 3.12):


@ -5,9 +5,15 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).

```bash
(Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```

PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs. On prior GPUs, this extra installation is not required.

If using the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) image, the prerequisite steps for installing NVIDIA Blackwell-enabled PyTorch package and `libopenmpi-dev` are not required.

2. Sanity check the installation by running the following in Python (tested on Python 3.12):

```{literalinclude} ../../../examples/llm-api/quickstart_example.py
@ -19,15 +25,7 @@

There are some known limitations when you pip install the pre-built TensorRT-LLM wheel package.

1. C++11 ABI

The pre-built TensorRT-LLM wheel is linked against the public PyTorch hosted on PyPI, which has the C++11 ABI turned off,
while the NVIDIA-optimized PyTorch inside the NGC container nvcr.io/nvidia/pytorch:xx.xx-py3 has the C++11 ABI turned on;
see the [NGC pytorch container page](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
Thus we recommend that users build from source inside the NGC PyTorch container. The build-from-source guideline can be found in
[Build from Source Code on Linux](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html)

2. MPI in the Slurm environment
1. MPI in the Slurm environment

If you encounter an error while running TensorRT-LLM in a Slurm-managed cluster, you need to reconfigure the MPI installation to work with Slurm.
The setup method depends on your Slurm configuration; please check with your admin. This is not TensorRT-LLM specific, but rather a general MPI + Slurm issue.
@ -38,7 +36,7 @@ There are some known limitations when you pip install pre-built TensorRT-LLM whe
to discover a SLURM installation in the usual places.
```

3. CUDA Toolkit
2. CUDA Toolkit

`pip install tensorrt-llm` won't install the CUDA Toolkit on your system, and the CUDA Toolkit is not required if you just want to deploy a TensorRT-LLM engine.
TensorRT-LLM uses [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/) to quantize a model, and ModelOpt requires the CUDA Toolkit to JIT-compile certain kernels (not included in PyTorch) to perform quantization effectively.
@ -49,4 +47,18 @@ There are some known limitations when you pip install pre-built TensorRT-LLM whe
UserWarning: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
Unable to load extension modelopt_cuda_ext and falling back to CPU version.
```
The installation of CUDA toolkit can be found in [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/)
The installation of CUDA toolkit can be found in [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/).

3. Install inside the PyTorch NGC Container

The PyTorch NGC Container may lock Python package versions via the `/etc/pip/constraint.txt` file. When installing the pre-built TensorRT-LLM wheel inside the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch), you need to clear this file first.

```bash
[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt
```

PyTorch NGC Container typically includes a pre-installed `tensorrt` Python package. If there is a version mismatch between this pre-installed package and the version required by the TensorRT-LLM wheel, you will need to uninstall the existing `tensorrt` package before installing TensorRT-LLM.

```bash
pip uninstall -y tensorrt
```

@ -1,165 +1,128 @@
(perf-overview)=

> [!IMPORTANT]
> As of TensorRT-LLM v0.10, these performance benchmarks have changed methodology to utilize in-flight batching and
no longer utilize static benchmarking. These numbers are initial measurements and are expected to improve in future
releases.

# Overview

This document summarizes performance measurements of TensorRT-LLM on a number of GPUs across a set of key models.

The data in the following tables is provided as a reference point to help users
validate observed performance. It should not be considered as the peak
performance that can be delivered by TensorRT-LLM.
The data in the following tables is provided as a reference point to help users validate observed performance.
It should *not* be considered as the peak performance that can be delivered by TensorRT-LLM.

## Known Issues
We attempted to keep commands as simple as possible to ease reproducibility and left many options at their default settings.
Tuning batch sizes, parallelism configurations, and other options may lead to improved performance depending on your situation.

The following issues are being addressed to improve the efficiency of TensorRT-LLM.

### Known AllReduce performance issue on AMD-based CPU platforms

We observed a performance issue on NCCL 2.23.4, which can be worked around by setting `NCCL_P2P_LEVEL` to `SYS`:
```
export NCCL_P2P_LEVEL=SYS
```
Multi-GPU cases may be affected by this issue, which is being addressed.

### Fused Matmul + Gated-SiLU (LLaMA)

The current implementation combines two Matmul operations into one Matmul followed by
a separate SwiGLU kernel (when `--use_fused_mlp=enable` is enabled). There is also a more
efficient implementation that runs single Matmul + SwiGLU fused kernel for FP8 on Hopper
(when `--use_fused_mlp=enable --gemm_swiglu_plugin fp8` is enabled). The gemm_swiglu_plugin
will support more data types and GPU architectures in future releases.

### Use *gptManagerBenchmark* for GH200

For release v0.17, on GH200 systems, we recommend using the legacy flow based on *gptManagerBenchmark* to measure performance.
For DeepSeek R1 performance, please check out our [performance guide](../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md).

## Throughput Measurements

The below table shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
and shows the throughput client-server scenario under maximum load.
and shows the throughput scenario under maximum load. The reported metric is `Total Output Throughput (tokens/sec)`.

The performance numbers below were collected using the steps described in this document.

**All data in the table below was generated using version 0.17 and presents token throughput in tokens/second.**
Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).

| Throughput (tokens / sec) | | GPU | H200 141GB HBM3 | H100 80GB HBM3 | GH200 480GB | L40S | A100-SXM4-80GB |
|:----------------------------|:---------------------------|:-----------------------------|:------------------|:-----------------|:--------------|:--------|:-----------------|
| | Precision | | FP8 | FP8 | FP8 | FP8 | FP16 |
| Model | Tensor Model Parallel Size | Runtime Input/Output Lengths | | | | | |
| LLaMA v3.1 8B | 1 | 128, 128 | 29526.04 | 28836.77 | 29852.96 | 9104.61 | 6627.27 |
| | | 128, 2048 | 25398.86 | 21109.38 | 21769.55 | 5365.81 | 5255.99 |
| | | 128, 4096 | 17370.8 | 13593.65 | 14189.89 | 3025.92 | 3453.79 |
| | | 500, 2000 | 21020.81 | 16500.69 | 17137.29 | 4273.75 | 4276.58 |
| | | 1000, 1000 | 17537.96 | 15244.78 | 16482.77 | 4054.71 | 3786.83 |
| | | 2048, 128 | 3794.14 | 3556.73 | 3843.95 | 1066.52 | 799.61 |
| | | 2048, 2048 | 11968.5 | 9488.42 | 10265.9 | 2225.27 | 2424.16 |
| | | 5000, 500 | 3987.79 | 3559.36 | 3932.58 | 981.2 | 825.13 |
| | | 20000, 2000 | 1804.1 | 1401.31 | 1560.2 | 327.97 | 330.04 |
| LLaMA v3.1 70B | 1 | 128, 128 | 4020.75 | 3378.03 | 3636.91 | | |
| | | 128, 2048 | 4165.68 | 911.62 | 2082.74 | | |
| | | 128, 4096 | 2651.75 | 426.32 | 1263.98 | | |
| | | 500, 2000 | 3018.39 | 775.57 | 1973.86 | | |
| | | 1000, 1000 | 2823.45 | 839.97 | 1746.12 | | |
| | | 2048, 128 | 465.99 | 343.29 | 424.96 | | |
| | | 2048, 2048 | 1913.8 | | 1086.93 | | |
| | | 5000, 500 | 560.16 | 245.34 | 422.36 | | |
| | | 20000, 2000 | 279.52 | | | | |
| | 2 | 128, 128 | 6823.01 | 6645.12 | | | 1313.96 |
| | | 128, 2048 | 8290.35 | 6169.58 | | | 531.26 |
| | | 128, 4096 | 6526.67 | 3897.06 | | | |
| | | 500, 2000 | 6848.02 | 4972.57 | | | 439.41 |
| | | 1000, 1000 | 5164.76 | 4390.53 | | | 472.94 |
| | | 2048, 128 | 809 | 772.66 | | | 148.96 |
| | | 2048, 2048 | 4183.88 | 2898.16 | | | 261.1 |
| | | 5000, 500 | 1025.38 | 919.73 | | | 121.47 |
| | | 20000, 2000 | 640.62 | 443.01 | | | |
| | 4 | 128, 128 | 11098.63 | 11127.53 | | 1523.52 | 2733.48 |
| | | 128, 2048 | 14156 | 11511.93 | | 1942.66 | 2811.27 |
| | | 128, 4096 | 10574.06 | 7439.41 | | 1440.23 | 1976.49 |
| | | 500, 2000 | 12452.79 | 9836.7 | | 1634.72 | 2275.79 |
| | | 1000, 1000 | 8911.29 | 7430.99 | | 1209.25 | 1921.77 |
| | | 2048, 128 | 1358.06 | 1302.6 | | 177.72 | 325.15 |
| | | 2048, 2048 | 7130.44 | 5480.03 | | 969.68 | 1393.64 |
| | | 5000, 500 | 1811.55 | 1602.78 | | 249.52 | 392.62 |
| | | 20000, 2000 | 1199.68 | 920.19 | | 162.25 | 212.08 |
| | 8 | 128, 128 | 15355.84 | 14730.69 | | 1464.03 | 4717.62 |
| | | 128, 2048 | 21195.88 | 17061.82 | | 2303.31 | 5241.5 |
| | | 128, 4096 | 16941.52 | 14171.43 | | 2018.22 | 3724.67 |
| | | 500, 2000 | 17278.4 | 14679.33 | | 1971.96 | 4445.37 |
| | | 1000, 1000 | 13181.24 | 11451.16 | | 1333.62 | 3320.41 |
| | | 2048, 128 | 1983.03 | 1923.41 | | 176.16 | 542.38 |
| | | 2048, 2048 | 11142.47 | 8801.95 | | 1200.16 | 2553.71 |
| | | 5000, 500 | 2717.83 | 2457.42 | | 259.71 | 696.34 |
| | | 20000, 2000 | 1920.45 | 1512.6 | | 209.87 | 413.38 |
| LLaMA v3.1 405B | 8 | 128, 128 | 3874.19 | | | | |
| | | 128, 2048 | 5938.09 | | | | |
| | | 128, 4096 | 5168.37 | | | | |
| | | 500, 2000 | 5084.29 | | | | |
| | | 1000, 1000 | 3399.69 | | | | |
| | | 2048, 128 | 463.42 | | | | |
| | | 2048, 2048 | 2940.62 | | | | |
| | | 5000, 500 | 669.13 | | | | |
| | | 20000, 2000 | 535.31 | | | | |
| Mistral 7B | 1 | 128, 128 | 31938.12 | 31674.49 | 32498.47 | 9664.13 | 6982.53 |
| | | 128, 2048 | 27409.3 | 23496.42 | 23337.29 | 5720.65 | 5630.62 |
| | | 128, 4096 | 18505.03 | 14350.99 | 15017.88 | 3136.33 | 3591.22 |
| | | 500, 2000 | 22354.67 | 18026.27 | 18556 | 4521.77 | 4400.48 |
| | | 1000, 1000 | 18426.16 | 16035.66 | 17252.11 | 4177.76 | 3896.58 |
| | | 2048, 128 | 3834.43 | 3642.48 | 3813.13 | 1076.74 | 808.58 |
| | | 2048, 2048 | 12347.37 | 9958.17 | 10755.94 | 2286.71 | 2489.77 |
| | | 5000, 500 | 4041.59 | 3591.33 | 3949.66 | 1001.02 | 844.64 |
| | | 20000, 2000 | 1822.69 | 1373.24 | 1601.28 | 337.83 | 332.3 |
| Mixtral 8x7B | 1 | 128, 128 | 17157.72 | 15962.49 | 16859.18 | | |
| | | 128, 2048 | 15095.21 | 8290.13 | 11120.16 | | |
| | | 128, 4096 | 9534.62 | 4784.86 | 6610.47 | | |
| | | 500, 2000 | 12105.27 | 6800.6 | 9192.86 | | |
| | | 1000, 1000 | 10371.36 | 6868.52 | 8849.18 | | |
| | | 2048, 128 | 2009.67 | 1892.81 | 1994.31 | | |
| | | 2048, 2048 | 6940.32 | 3983.1 | 5545.46 | | |
| | | 5000, 500 | 2309.1 | 1764.7 | 2078.27 | | |
| | | 20000, 2000 | 1151.78 | 673.7 | 860.68 | | |
| | 2 | 128, 128 | 27825.34 | 27451.13 | | | 5541.47 |
| | | 128, 2048 | 29584.05 | 22830.08 | | | 4169.78 |
| | | 128, 4096 | 21564.68 | 14237.01 | | | 2608.05 |
| | | 500, 2000 | 23410.63 | 17036.04 | | | 3446.37 |
| | | 1000, 1000 | 19151.19 | 15770.89 | | | 3154.52 |
| | | 2048, 128 | 3383.16 | 3333.68 | | | 649 |
| | | 2048, 2048 | 14007.29 | 10685.85 | | | 2056.58 |
| | | 5000, 500 | 4223.68 | 3646.09 | | | 724.44 |
| | | 20000, 2000 | 2299.21 | 1757.45 | | | 337.51 |
| | 4 | 128, 128 | 42551.59 | 41068.23 | | 6921.87 | 10324.28 |
| | | 128, 2048 | 52291.78 | 41164.73 | | 7996.93 | 10911.86 |
| | | 128, 4096 | 39513.73 | 27912.48 | | 5736.09 | 7666.51 |
| | | 500, 2000 | 43818.99 | 34489.34 | | 6914.68 | 8456.21 |
| | | 1000, 1000 | 33580.9 | 27784.74 | | 5251.49 | 7122.84 |
| | | 2048, 128 | 5467.62 | 5234.98 | | 827.62 | 1237.62 |
| | | 2048, 2048 | 24980.93 | 19432.08 | | 3935.32 | 5222.98 |
| | | 5000, 500 | 7084.94 | 6401.56 | | 1092.88 | 1500.55 |
| | | 20000, 2000 | 4236.84 | 3303.83 | | 682.48 | 829.59 |
| | 8 | 128, 128 | 53212.55 | 50849.55 | | 6740.84 | 17043.54 |
| | | 128, 2048 | 68608.45 | 61607.7 | | 10393.3 | 20277.88 |
| | | 128, 4096 | 54827.78 | 48280.37 | | 8472.35 | 15282.89 |
| | | 500, 2000 | 58706.39 | 52583.65 | | 8660.71 | 17184.24 |
| | | 1000, 1000 | 44705.48 | 40631.71 | | 5947.72 | 12851.44 |
| | | 2048, 128 | 7554.38 | 6988.18 | | 811.96 | 2165.52 |
| | | 2048, 2048 | 36193.64 | 30983.35 | | 5136.98 | 9809.76 |
| | | 5000, 500 | 10271.8 | 9210.11 | | 1153.76 | 2761.28 |
| | | 20000, 2000 | 6835.53 | 5602.43 | | 918.95 | 1592.53 |
| Mixtral 8x22B | 8 | 128, 128 | 22948.57 | 21876.08 | | | 6381.95 |
| | | 128, 2048 | 32415.81 | 25150.03 | | | 6685.99 |
| | | 128, 4096 | 25753.14 | 18387.4 | | | 4789.13 |
| | | 500, 2000 | 27429.6 | 21421.86 | | | 5648.46 |
| | | 1000, 1000 | 19712.35 | 16573.24 | | | 4549.46 |
| | | 2048, 128 | 2899.84 | 2794.97 | | | 761.56 |
| | | 2048, 2048 | 15798.59 | 12244.93 | | | 3521.98 |
| | | 5000, 500 | 4031.79 | 3645.27 | | | 959.14 |
| | | 20000, 2000 | 2815.76 | 2227.63 | | | 575.02 |

### FP4 Models:
```
nvidia/Llama-3.3-70B-Instruct-FP4
nvidia/Llama-3.1-405B-Instruct-FP4
```

*TP stands for Tensor Parallelism*
#### Llama 3.3 70B FP4
| | GPU | B200 | | | |
|:-----------------------------|:---|:----------|:----------|:----------|:----------|
| | TP Size | 1 | 2 | 4 | 8 |
| ISL, OSL| | | | | |
| | | | | | |
| 128, 128 | | 11,253.28 | 17,867.66 | 24,944.50 | 27,471.49 |
| 128, 2048 | | 9,925.00 | 15,459.71 | 23,608.58 | 30,742.86 |
| 128, 4096 | | 6,318.92 | 8,711.88 | 17,659.74 | 24,947.05 |
| 500, 2000 | | 7,559.88 | 10,602.27 | 20,910.23 | 28,182.34 |
| 1000, 1000 | | 6,866.96 | 10,838.01 | 16,567.86 | 19,991.64 |
| 1000, 2000 | | 6,736.88 | 9,132.08 | 15,737.02 | 20,518.04 |
| 1024, 2048 | | 6,580.56 | 8,767.45 | 15,722.55 | 20,437.96 |
| 2048, 128 | | 1,375.49 | 1,610.69 | 2,707.58 | 3,717.82 |
| 2048, 2048 | | 4,544.73 | 6,956.14 | 12,292.23 | 15,661.22 |
| 5000, 500 | | 1,488.19 | 2,379.73 | 3,588.45 | 4,810.21 |
| 20000, 2000 | | 580.96 | 1,043.58 | 1,957.84 | 3,167.30 |

#### Llama 3.1 405B FP4
| | GPU | B200 |
|:-----------------------------|:---|:----------|
| | TP Size | 8 |
| ISL, OSL| | |
| | | |
| 128, 128 | | 9,184.83 |
| 128, 2048 | | 10,387.23 |
| 128, 4096 | | 8,741.80 |
| 500, 2000 | | 9,242.34 |
| 1000, 1000 | | 7,565.50 |
| 1000, 2000 | | 7,696.76 |
| 1024, 2048 | | 7,568.93 |
| 2048, 128 | | 953.57 |
| 2048, 2048 | | 6,092.32 |
| 5000, 500 | | 1,332.22 |
| 20000, 2000 | | 961.58 |

### FP8 Models:
```
nvidia/Llama-3.1-8B-Instruct-FP8
nvidia/Llama-3.1-70B-Instruct-FP8
nvidia/Llama-3.1-405B-Instruct-FP8
```

#### Llama 3.1 8B FP8
| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 28,447.38 | 27,568.68 |
| 128, 2048 | | 23,294.74 | 22,003.62 |
| 128, 4096 | | 17,481.48 | 13,640.35 |
| 500, 2000 | | 21,462.57 | 17,794.39 |
| 1000, 1000 | | 17,590.60 | 15,270.02 |
| 1000, 2000 | | 17,139.51 | 13,850.22 |
| 1024, 2048 | | 16,970.63 | 13,374.15 |
| 2048, 128 | | 3,531.33 | 3,495.05 |
| 2048, 2048 | | 12,022.38 | 9,653.67 |
| 5000, 500 | | 3,851.65 | 3,371.16 |
| 20000, 2000 | | 1,706.06 | 1,340.92 |

#### Llama 3.1 70B FP8
| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
|:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------|
| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
| ISL, OSL| | | | | | | | | |
| | | | | | | | | | |
| 128, 128 | | 3,657.58 | 6,477.50 | 10,466.04 | 15,554.57 | 3,191.27 | 6,183.41 | 10,260.68 | 14,686.01 |
| 128, 2048 | | 4,351.07 | 8,450.31 | 13,438.71 | 20,750.58 | 745.19 | 5,822.02 | 11,442.01 | 17,463.99 |
| 128, 4096 | | 2,696.61 | 5,598.92 | 11,524.93 | 16,634.90 | | 3,714.87 | 8,209.91 | 12,598.55 |
| 500, 2000 | | 3,475.58 | 6,712.35 | 12,332.32 | 17,311.28 | | 4,704.31 | 10,278.02 | 14,630.41 |
| 1000, 1000 | | 2,727.42 | 5,097.36 | 8,698.15 | 12,794.92 | 734.67 | 4,191.26 | 7,427.35 | 11,082.48 |
| 1000, 2000 | | 2,913.54 | 5,841.15 | 9,016.49 | 13,174.68 | 526.31 | 3,920.44 | 7,590.35 | 11,108.11 |
| 1024, 2048 | | 2,893.02 | 5,565.28 | 9,017.72 | 13,117.34 | 525.43 | 3,896.14 | 7,557.32 | 11,028.32 |
| 2048, 128 | | 433.30 | 772.97 | 1,278.26 | 1,947.33 | 315.90 | 747.51 | 1,240.12 | 1,840.12 |
| 2048, 2048 | | 1,990.25 | 3,822.83 | 7,068.68 | 10,529.06 | 357.98 | 2,732.86 | 5,640.31 | 8,772.88 |
| 5000, 500 | | 543.88 | 1,005.81 | 1,714.77 | 2,683.22 | 203.27 | 866.77 | 1,571.92 | 2,399.78 |
| 20000, 2000 | | 276.99 | 618.01 | 1,175.35 | 2,021.08 | | 408.43 | 910.77 | 1,568.84 |

#### Llama 3.1 405B FP8
| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
| | TP Size | 8 | 8 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 3,800.11 | 3,732.40 |
| 128, 2048 | | 5,661.13 | 4,572.23 |
| 128, 4096 | | 5,167.18 | 2,911.42 |
| 500, 2000 | | 4,854.29 | 3,661.85 |
| 1000, 1000 | | 3,332.15 | 2,963.36 |
| 1000, 2000 | | 3,682.15 | 3,253.17 |
| 1024, 2048 | | 3,685.56 | 3,089.16 |
| 2048, 128 | | 453.42 | 448.89 |
| 2048, 2048 | | 3,055.73 | 2,139.94 |
| 5000, 500 | | 656.11 | 579.14 |
| 20000, 2000 | | 514.02 | 370.26 |

## Reproducing Benchmarked Results

@ -168,25 +131,14 @@ The performance numbers below were collected using the steps described in this d
The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
description of this benchmarking workflow, see the [benchmarking suite documentation](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html).

### Commands
### Command Overview

Starting with v0.19, testing was performed using the PyTorch backend; this workflow does not require an engine to be built.

#### For systems other than GH200
| Stage | Description | Command |
| :- | - | - |
| [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` |
| [Build](#engine-building) | Build a TensorRT-LLM engine | `trtllm-bench --model $model_name build --tp_size $tp_size --pp_size $pp_size --quantization FP8 --dataset $dataset_file` |
| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --engine_dir $engine_dir` |

#### For GH200 systems only
For release v0.17, on GH200 systems, the recommendation is to use the legacy flow based on *gptManagerBenchmark* to measure performance.

| Stage | Description | Command |
| :- | - | - |
| [Dataset](#preparing-a-dataset) | Create a synthetic dataset for engine building | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` |
| [Build](#engine-building) | Build a TensorRT-LLM engine | `trtllm-bench --model $model_name build --tp_size $tp_size --quantization FP8 --dataset $dataset_file` |
| [Dataset](#preparing-a-dataset) | Create a synthetic dataset for benchmarking in json format | `python benchmarks/cpp/prepare_dataset.py --output=$dataset_file_json --tokenizer=$model_name token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0` |
| [Run](#running-the-benchmark) | Run a benchmark with a dataset in json format | `/app/tensorrt_llm/benchmarks/cpp/gptManagerBenchmark --engine_dir $engine_dir --type IFB --api executor --dataset $dataset_file_json --eos_id -1 --log_iteration_data --scheduler_policy guaranteed_no_evict --kv_cache_free_gpu_mem_fraction 0.95 --output_csv result.csv --request_rate -1.0 --enable_chunked_context --warm_up 0` |

| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options` |

### Variables

@ -196,11 +148,11 @@ For release v0.17, on GH200 systems, the recommendation is to use the legacy flo
| `$osl` | Benchmark output sequence length. |
| `$tp_size` | Tensor parallel mapping degree to run the benchmark with |
| `$pp_size` | Pipeline parallel mapping degree to run the benchmark with |
| `$engine_dir` | Location to store built engine file (can be deleted after running benchmarks). |
| `$model_name` | HuggingFace model name, e.g. meta-llama/Llama-2-7b-hf, or the path to a local weights directory |
| `$dataset_file` | Location of the dataset file generated by `prepare_dataset.py` |
| `$num_requests` | The number of requests to generate for dataset generation |
| `$seq_len` | A sequence length of ISL + OSL |
| `$llm_options` | (optional) A yaml file containing additional options for the LLM API |

### Preparing a Dataset

@ -228,6 +180,7 @@ remain in the system longer and therefore require less requests to achieve stead
| 128 | 128 | 256 | 30000 |
| 128 | 2048 | 2176 | 3000 |
| 128 | 4096 | 4224 | 1500 |
| 1000 | 2000 | 3000 | 1500 |
| 2048 | 128 | 2176 | 3000 |
| 2048 | 2048 | 4096 | 1500 |
| 5000 | 500 | 5500 | 1500 |
| 1000 | 1000 | 2000 | 3000 |
| 500 | 2000 | 2500 | 3000 |
| 20000 | 2000 | 22000 | 1000 |

### Engine Building

All engines are built using the `trtllm-bench build` subcommand.
The basic command for FP8 quantized engines is as follows:

```
trtllm-bench --model $model_name build --tp_size $tp_size --pp_size $pp_size --quantization FP8 --dataset $dataset_file
```
When providing `--dataset` in the build subcommand, `trtllm-bench build` uses high-level statistics of the dataset (average ISL/OSL, max sequence length) and tuning heuristics to optimize engine build settings.

Alternatively, if you would like to build the engine with specific settings, you can do so by specifying the values for `max_batch_size` and `max_num_tokens`:

```
trtllm-bench --model $model_name build --tp_size $tp_size --pp_size $pp_size --quantization FP8 --max_seq_len $seq_len --max_batch_size $max_bs --max_num_tokens $max_token
```

If you would like to build an FP16 engine without any quantization, simply remove the `--quantization FP8` option. If using pre-quantized weights (e.g. `nvidia/Llama-3.1-70B-Instruct-FP8` from HuggingFace), please set the `--quantization` argument to the model dtype to ensure the KV cache is set to the appropriate dtype.

> [!NOTE] If you specify FP8 quantization, the KV cache will automatically be set to FP8 as well!

The `trtllm-bench build` subcommand will output the path where the engine is located upon a successful build. For example,

```shell
===========================================================
ENGINE SAVED: /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
===========================================================
```

### Running the Benchmark

### For non-GH200 systems
To run the benchmark with the generated dataset, simply use the `trtllm-bench throughput` subcommand. The benchmarker will
run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide
the path to the engine from the [build](#engine-building) phase and a [generated dataset](#preparing-a-dataset).
a model name (a HuggingFace reference or the path to a local model), a [generated dataset](#preparing-a-dataset), and a file containing any desired extra options for the LLM API (details in [tensorrt_llm/llmapi/llm_args.py:LlmArgs](../../../tensorrt_llm/llmapi/llm_args.py)).

```shell
trtllm-bench --model $model_name throughput --dataset $dataset_file --engine_dir $engine_dir
trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
```

`llm_options.yml`
```yaml
pytorch_backend_config:
  enable_overlap_scheduler: true
  use_cuda_graph: true
  cuda_graph_padding_enabled: true
  cuda_graph_batch_sizes:
    - 1
    - 2
    - 4
    - 8
    - 16
    - 32
    - 64
    - 128
    - 256
    - 384
    - 512
    - 1024
    - 2048
    - 4096
    - 8192
```

In the majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out-of-memory issue.
@ -280,140 +229,50 @@ The results will be printed to the terminal upon benchmark completion. For examp

```shell
===========================================================
= ENGINE DETAILS
= PERFORMANCE OVERVIEW
===========================================================
Model: meta-llama/Llama-2-7b-hf
Engine Directory: /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
TensorRT-LLM Version: 0.12.0
Dtype: float16
KV Cache Dtype: FP8
Quantization: FP8
Max Input Length: 2048
Max Sequence Length: 4098
Request Throughput (req/sec): 43.2089
Total Output Throughput (tokens/sec): 5530.7382
Per User Output Throughput (tokens/sec/user): 2.0563
Per GPU Output Throughput (tokens/sec/gpu): 5530.7382
Total Token Throughput (tokens/sec): 94022.5497
Total Latency (ms): 115716.9214
Average request latency (ms): 75903.4456
Per User Output Speed [1/TPOT] (tokens/sec/user): 5.4656
Average time-to-first-token [TTFT] (ms): 52667.0339
Average time-per-output-token [TPOT] (ms): 182.9639

===========================================================
= WORLD + RUNTIME INFORMATION
===========================================================
TP Size: 1
PP Size: 1
Max Runtime Batch Size: 4096
Max Runtime Tokens: 8192
Scheduling Policy: Guaranteed No Evict
KV Memory Percentage: 99.0%
Issue Rate (req/sec): 3.680275266452667e+18
===========================================================
= STATISTICS
===========================================================
Number of requests: 3000
Average Input Length (tokens): 128.0
Average Output Length (tokens): 128.0
Token Throughput (tokens/sec): 23405.927228471104
Request Throughput (req/sec): 182.8588064724305
Total Latency (seconds): 16.406100739
===========================================================
-- Per-Request Time-per-Output-Token [TPOT] Breakdown (ms)

[TPOT] MINIMUM: 32.8005
[TPOT] MAXIMUM: 208.4667
[TPOT] AVERAGE: 182.9639
[TPOT] P50    : 204.0463
[TPOT] P90    : 206.3863
[TPOT] P95    : 206.5064
[TPOT] P99    : 206.5821

-- Per-Request Time-to-First-Token [TTFT] Breakdown (ms)

[TTFT] MINIMUM: 3914.7621
[TTFT] MAXIMUM: 107501.2487
[TTFT] AVERAGE: 52667.0339
[TTFT] P50    : 52269.7072
[TTFT] P90    : 96583.7187
[TTFT] P95    : 101978.4566
[TTFT] P99    : 106563.4497

-- Request Latency Breakdown (ms) -----------------------

[Latency] P50    : 78509.2102
[Latency] P90    : 110804.0017
[Latency] P95    : 111302.9101
[Latency] P99    : 111618.2158
[Latency] MINIMUM: 24189.0838
[Latency] MAXIMUM: 111668.0964
[Latency] AVERAGE: 75903.4456
```

> [!WARNING] In some cases, the benchmarker may not print anything at all. This behavior usually
means that the benchmark has hit an out of memory issue. Try reducing the KV cache percentage
using the `--kv_cache_free_gpu_mem_fraction` option to lower the percentage of used memory.

## Online Serving Measurements

The [TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend) is used to measure the performance of TensorRT-LLM for online serving.

The below table shows the throughput and latency under a serving scenario.

**All data in the table below was generated using version 0.14.0, with 500 requests and BF16 precision.**

| | | | | | | | | | | |
| --------------- | -------------------| --------| --------| --------| --------|------------------| ------------------ | ------------------ | ----------------------------- |------------------------ |
| **Model** | **GPU** | **TP** | **Input Length** | **Output Length** | **QPS** | **Tput(req/s)** | **Mean TTFT(ms)** | **Mean ITL(ms)** | **Total Token Tput (tok/s)** | **Output Tput (tok/s)** |
|LLaMA 3.1 70B|H100 80GB HBM3|4|467|256|2|2|62|21|1406|498|
||||||4|4|68|24|2750|973|
||||||8|7|92|32|5256|1860|
||||||16|12|175|66|8941|3164|
||||||32|16|1229|86|11537|4083|
||||||INF|16|9123|85|11593|4103|
||||467|16|2|2|53|18|844|28|
||||||4|4|58|20|1908|63|
||||||8|8|71|24|3795|126|
||||||16|16|109|38|7492|248|
||||||32|28|1197|482|13655|452|
||||||INF|28|9126|548|13719|454|
||||202|214|2|2|48|20|780|401|
||||||4|4|51|22|1499|771|
||||||8|7|57|25|2702|1390|
||||||16|11|74|32|4364|2245|
||||||32|14|116|42|5837|3003|
||||||INF|16|4482|50|6725|3459|
|LLaMA 3.1 8B||1|467|256|2|2|23|8|1423|504|
||||||4|4|24|9|2624|929|
||||||8|8|26|9|5535|1959|
||||||16|15|30|11|10636|3765|
||||||32|26|50|19|19138|6774|
||||||INF|37|3335|39|26614|9420|
||||467|16|2|2|19|7|956|32|
||||||4|4|20|7|1910|63|
||||||8|8|22|7|3808|126|
||||||16|16|24|8|7567|251|
||||||32|31|29|10|14894|493|
||||||INF|79|3280|193|38319|1269|
||||202|214|2|2|19|7|809|416|
||||||4|4|20|8|1586|816|
||||||8|7|21|9|3047|1568|
||||||16|13|23|10|5597|2879|
||||||32|23|27|11|9381|4825|
||||||INF|39|1657|21|16117|8291|
|LLaMA 3.1 70B|H200 131GB HBM3|4|467|256|2|2|58|18|1411|499|
||||||4|4|63|20|2770|980|
||||||8|7|84|27|5328|1886|
||||||16|13|165|60|9224|3264|
||||||32|16|1279|83|11800|4176|
||||||INF|16|9222|83|11826|4185|
||||467|16|2|2|50|15|956|32|
||||||4|4|55|16|1909|63|
||||||8|8|67|20|3799|126|
||||||16|16|103|33|7499|248|
||||||32|28|1259|485|13586|450|
||||||INF|29|9074|546|13792|457|
||||202|214|2|2|43|17|793|408|
||||||4|4|46|18|1524|784|
||||||8|7|51|21|2796|1438|
||||||16|11|67|28|4639|2386|
||||||32|15|112|39|6288|3235|
||||||INF|17|4480|48|7230|3719|
|LLaMA 3.1 8B|H200 131GB HBM3|1|467|256|2|2|21|6|1425|504|
||||||4|4|23|7|2828|1001|
||||||8|8|24|7|5567|1971|
||||||16|15|27|9|10761|3809|
||||||32|27|44|16|19848|7025|
||||||INF|40|3237|36|28596|10121|
||||467|16|2|2|18|5|956|32|
||||||4|4|19|6|1910|63|
||||||8|8|20|6|3810|126|
||||||16|16|22|7|7567|250|
||||||32|31|27|9|14927|494|
||||||INF|81|3227|190|39007|1291|
||||202|214|2|2|17|6|812|418|
||||||4|4|18|6|1597|822|
||||||8|7|19|7|3088|1589|
||||||16|14|20|8|5771|2969|
||||||32|24|24|9|9931|5109|
||||||INF|43|1665|19|17861|9189|

*TP stands for Tensor Parallelism*

*TTFT stands for Time To First Token*

*ITL stands for Inter Token Latency*


### For GH200 systems only
For release v0.17, on GH200 systems, the recommendation is to use *gptManagerBenchmark* to measure performance. Throughput measurements are reported based on the below commands.
```shell
/app/tensorrt_llm/benchmarks/cpp/gptManagerBenchmark --engine_dir $engine_dir --type IFB --dataset $dataset_file_json --eos_id -1 --scheduler_policy guaranteed_no_evict --kv_cache_free_gpu_mem_fraction 0.95 --output_csv result.csv --request_rate -1.0 --enable_chunked_context --warm_up 0
```

The command will run the `gptManagerBenchmark` binary that will report the throughput and other metrics as part of its output
that can be compared with the table in the [Throughput Measurements](#throughput-measurements) of this README.

@ -15,7 +15,7 @@ Here is a simple example to show how to use the LLM API with TinyLlama.
```

You can also directly load TensorRT Model Optimizer's [quantized checkpoints on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) in the LLM constructor.
To learn more about the LLM API, check out the [](llm-api/index) and [](llm-api-examples/index).
To learn more about the LLM API, check out the [](llm-api/index) and [](examples/llm_api_examples).

(deploy-with-trtllm-serve)=
## Deploy with trtllm-serve
@ -151,7 +151,7 @@ In this Quick Start Guide, you:

For more examples, refer to:

- [examples/](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for showcases of how to run a quick benchmark on latest LLMs.
- [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for showcases of how to run a quick benchmark on latest LLMs.

## Related Information


@ -69,27 +69,19 @@ Before KV cache blocks are allocated, some amount of GPU memory are pre-allocate

##### C++ runtime

* When paged KV cache is enabled

TensorRT-LLM runtime pre-allocates KV cache tensors during initialization for a configured number of blocks and distributes them at runtime.
TensorRT-LLM runtime pre-allocates paged KV cache pools during initialization for a configured number of blocks and distributes them at runtime.

KV cache tensors are allocated based on the `KVCacheConfig` object when creating the `Executor`. If neither `maxTokens` nor `freeGpuMemoryFraction` is specified, the KV cache will by default allocate 90% of the remaining free GPU memory. When either `maxTokens` or `freeGpuMemoryFraction` is specified, the specified value is used to compute the KV cache memory size. If both are specified, `freeGpuMemoryFraction` is first used to compute the number of tokens in the KV cache, and then the minimum between this computed number of tokens and `maxTokens` is used.

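The sizing rule above can be summarized with a small helper (purely illustrative; the real logic lives in the C++ runtime):

```python
def kv_cache_num_tokens(free_gpu_mem_bytes, bytes_per_token,
                        max_tokens=None, free_gpu_memory_fraction=None):
    """Illustrative model of the KVCacheConfig sizing rule described above."""
    if max_tokens is None and free_gpu_memory_fraction is None:
        free_gpu_memory_fraction = 0.9          # default: 90% of free memory
    tokens = None
    if free_gpu_memory_fraction is not None:
        tokens = int(free_gpu_mem_bytes * free_gpu_memory_fraction
                     // bytes_per_token)
    if max_tokens is not None:
        tokens = max_tokens if tokens is None else min(tokens, max_tokens)
    return tokens
```
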
With in-flight batching, the scheduler can automatically schedule requests as long as enough KV cache space is available (the exact behavior depends on the scheduler policy).

If paged KV cache is used in `GptSession` (already deprecated) without in-flight batching, TensorRT-LLM may report OOM errors with the message "Can't allocate new blocks. No free blocks left" if the paged KV cache is not large enough for the whole batch.

* When paged KV cache is disabled (not recommended, and only allowed for the deprecated `GptSession`)

C++ runtime allocates the KV cache tensors for each layer with shape `[batch size, 2, heads, max seq length, hidden dimension per head]`, where `max seq length` is specified by `GptSession::Config::maxSequenceLength` when creating `GptSession`.

##### Python runtime (Not recommended to be used)

The Python runtime allocates KV cache tensors based on the parameters of the `GenerationSession.setup` function; the KV cache size is linearly dependent on `batch_size` and `max_context_length+max_new_tokens`. **Note: This may change in the future, as the Python bindings of the C++ runtime may replace the current Python runtime. The Python bindings of the C++ runtime behave like the C++ runtime.**

## Memory pool

The TensorRT-LLM C++ runtime uses a stream-ordered memory allocator to allocate and free buffers; see [BufferManager::initMemoryPool](source:cpp/tensorrt_llm/runtime/bufferManager.cpp), which uses the default memory pool managed by the CUDA driver. When a `GptSession` object is destroyed, memory is returned to the memory pool and can be reused by the next instance of a `GptSession` object. Memory will be released from the pool if it is required for other memory allocations.
The TensorRT-LLM C++ runtime uses a stream-ordered memory allocator to allocate and free buffers; see [BufferManager::initMemoryPool](source:cpp/tensorrt_llm/runtime/bufferManager.cpp), which uses the default memory pool managed by the CUDA driver. When a `TrtGptModel` object is destroyed, memory is returned to the memory pool and can be reused by the next instance of a `TrtGptModel` object. Memory will be released from the pool if it is required for other memory allocations.

However, `nvidia-smi` may still show high memory occupation after memory is returned to the CUDA driver's memory pool. This should not be a concern and is intended behavior. The amount of reserved and free memory in the pool can be inspected with [BufferManager::memoryPoolReserved()](source:cpp/tensorrt_llm/runtime/bufferManager.cpp) and [BufferManager::memoryPoolFree()](source:cpp/tensorrt_llm/runtime/bufferManager.cpp), respectively.

@ -4,7 +4,34 @@

TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA GPUs. The following sections provide a list of supported GPU architectures as well as important features implemented in TensorRT-LLM.

## Models
## Models (PyTorch Backend)

| Architecture | Model | HuggingFace Example | Modality |
|--------------|-------|---------------------|----------|
| `BertForSequenceClassification` | BERT-based | `textattack/bert-base-uncased-yelp-polarity` | L |
| `DeciLMForCausalLM` | Nemotron | `nvidia/Llama-3_1-Nemotron-51B-Instruct` | L |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` | L |
| `LlavaLlamaModel` | VILA | `Efficient-Large-Model/NVILA-8B` | L + V |
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | `llava-hf/llava-v1.6-mistral-7b-hf` | L + V |
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA | `meta-llama/Meta-Llama-3.1-70B` | L |
| `Llama4ForConditionalGeneration` | Llama 4 | `meta-llama/Llama-4-Scout-17B-16E-Instruct` | L |
| `MistralForCausalLM` | Mistral | `mistralai/Mistral-7B-v0.1` | L |
| `MixtralForCausalLM` | Mixtral | `mistralai/Mixtral-8x7B-v0.1` | L |
| `MllamaForConditionalGeneration` | Llama 3.2 | `meta-llama/Llama-3.2-11B-Vision` | L |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base` | L |
| `NemotronNASForCausalLM` | NemotronNAS | `nvidia/Llama-3_3-Nemotron-Super-49B-v1` | L |
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/Qwen2-7B-Instruct` | L |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B` | L |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B` | L |
| `Qwen2VLForConditionalGeneration` | Qwen2-VL | `Qwen/Qwen2-VL-7B-Instruct` | L + V |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | `Qwen/Qwen2.5-VL-7B-Instruct` | L + V |

Note:
- L: Language only
- L + V: Language and Vision multimodal support
- Llama 3.2 accepts vision input, but our support is currently limited to text only.

## Models (TensorRT Backend)

### LLM Models

@ -115,9 +142,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
  - Software Compatibility
* - Container
  - [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
  - [25.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
  - [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
  - [10.10](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
  -
  - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

@ -5,6 +5,127 @@
|
||||
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
|
||||
|
||||
|
||||
## TensorRT-LLM Release 0.19.0
|
||||
|
||||
### Key Features and Enhancements
|
||||
- **The C++ runtime is now open sourced.**
- **PyTorch workflow**
  - Added DeepSeek V3/R1 support. Refer to `examples/deepseek_v3/README.md` and the blog `docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md`.
  - Added Llava-Next support.
  - Added BERT support.
  - Added a C++-based decoder, which adds support for:
    - TopK / TopP sampling.
    - Bad words.
    - Stop words.
    - Embedding bias.
  - Added an Autotuner for a custom-op-compatible tuning process:
    - Added a Python-based Autotuner core framework for kernel tuning.
    - Applied the Autotuner to fused MoE and NVFP4 linear operators for concept and performance evaluations.
  - Added guided decoding support (XGrammar integration); a usage sketch follows this list.
  - Added pipeline parallelism support for the overlap scheduler in `PyExecutor`.
  - Added Qwen2-VL model support.
  - Added mixed-precision quantization support.
  - Added pipeline parallelism with attention DP support.
  - Added no-cache attention support.
  - Added `PeftCacheManager` support.
  - Added Qwen2.5-VL support and refactored Qwen2-VL.
  - Added trtllm-gen FP4 GEMM support.
  - Added Qwen2 MoE support.
  - Applied the `AutoTuner` to both fused MoE and NVFP4 linear operators.
  - Introduced a `UserBuffers` allocator.
  - Added DeepSeek eager-mode AllReduce fusion support.
  - Added Multi-Token Prediction (MTP) support. Refer to the "Multi-Token Prediction (MTP)" section of `examples/deepseek_v3/README.md`.
  - Added FlashMLA support for SM90.
  - Added support for enabling MTP with CUDA graph padding.
  - Added an initial EAGLE-3 implementation.
  - Added support for FP8 MLA on NVIDIA Hopper and Blackwell GPUs.
- **AutoDeploy for PyTorch workflow**
  - The AutoDeploy for PyTorch workflow is an **experimental** feature in `tensorrt_llm._torch.auto_deploy`.
  - AutoDeploy provides an automated path from off-the-shelf models to optimized deployment in the TensorRT-LLM runtime.
  - Check out `examples/auto_deploy/README.md` for more details.
- LLM API
  - [BREAKING CHANGE] Added dynamic logits processor support and deprecated the static logits processor.
  - Added batched logits processor support.
  - Added EAGLE support.
  - Added abort-request support.
  - Added `get_stats` support.
  - Added multi-node support for Slurm-based clusters. Refer to `examples/llm-api/llm_mgmn_*.sh`.
- Added InternLM-XComposer2 support. Refer to the "InternLM-XComposer2" section in `examples/multimodal/README.md`.
- Added INT4-AWQ support for MoE models. Refer to the "AWQ Quantization" section in `examples/mixtral/README.md`.
- Added Qwen2-Audio support. Refer to `examples/qwen2audio/README.md`.
- Added Language-Adapter support. Refer to `examples/language_adapter/README.md`.
- Added STDiT for OpenSoRA text-to-video support. Refer to `examples/stdit/README.md`.
- Added vision encoders with tensor parallelism and context parallelism support. Refer to `examples/vit/README.md`.
- Added EXAONE-Deep support. Refer to `examples/exaone/README.md`.
- Added support for Phi-4-mini and Phi-4-MM.
- Added Gemma3 text-only model support. Refer to the "Run Gemma 3" section in `examples/gemma/README.md`.
- Added FP8 quantization support for Qwen2-VL.
- Added batched inference support for the LLM API MMLU example, `examples/mmlu_llmapi.py`.
- Added FP4 quantization-layernorm fusion plugin support (Llama models only).
- Added Mamba-Hybrid support.
- Added NVILA video support, including 1-prompt/N-media and N-prompt/N-media batching modes.
- Added a `--quantize_lm_head` option to `examples/quantization/quantize.py` to support `lm_head` quantization.
- Added batched tensor FP4 quantization support.
- Added a `/metrics` endpoint for `trtllm-serve` to log iteration statistics.
- Added LoRA support for the Phi-2 model.
- Added support for returning context logits from `trtllm-serve`.
- Added a one-shot version of UserBuffer AllReduce-Normalization for FP16/BF16.
- Added request BW metric measurement for `disaggServerBenchmark`.
- Updated the logits bitmask kernel to v3.
- Enabled CUDA graphs when attention DP is used and active requests are uneven across GPUs.
- Added iteration log support for `trtllm-bench`.
- `fp8_blockscale_gemm` is now open-sourced.
- Added AWQ support for ModelOpt checkpoints.
- Added linear block scale layout support in FP4 quantization.
- Added pre-quantized FP8 checkpoint support for Nemotron-mini-4b-instruct.
- Added Variable-Beam-Width-Search (VBWS) support (part 2).
- Added LoRA support for Gemma.
- Refactored the scaffolding worker and added OpenAI API worker support.
- Added an option to split MoE inputs into chunks to reduce GPU memory usage.
- Added UCX IP interface support.
- [BREAKING CHANGE] Added output of the first token to additional generation outputs.
- Added FP8 support for the SM120 architecture.
- Registered `ENABLE_MULTI_DEVICE` and `ENABLE_UCX` as CMake options.
- Made the scaffolding Controller more generic.
- [BREAKING CHANGE] Added individual gatherContext support for each additional output.
- Enabled the `PyExecutor` inference flow to estimate `max_num_tokens` for `kv_cache_manager`.
- Added `TLLM_OVERRIDE_LAYER_NUM` and `TLLM_TRACE_MODEL_FORWARD` environment variables for debugging.
- Added support for aborting disconnected requests.
- Added an option to run disaggregated serving without context servers.
- Fixed and improved allreduce and fusion kernels.
- Improved the robustness of the scaffolding integration via `init.py`.
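As promised above, here is a usage sketch for the guided-decoding feature. It is illustrative rather than authoritative: it assumes the LLM API exposes `GuidedDecodingParams` via `tensorrt_llm.sampling_params` and accepts a `guided_decoding_backend="xgrammar"` argument; exact names may differ across releases, so check the `llm_guided_decoding` example for your version.

```python
from tensorrt_llm import LLM
from tensorrt_llm.sampling_params import GuidedDecodingParams, SamplingParams

# Assumption: the XGrammar backend is selected at LLM construction time.
llm = LLM(model="Qwen/Qwen2-7B-Instruct", guided_decoding_backend="xgrammar")

# Constrain the output to a small JSON schema; the backend masks
# disallowed tokens at each decoding step so the output always parses.
schema = ('{"type": "object", '
          '"properties": {"answer": {"type": "integer"}}, '
          '"required": ["answer"]}')
params = SamplingParams(max_tokens=64,
                        guided_decoding=GuidedDecodingParams(json=schema))

for out in llm.generate(["What is 2 + 2? Reply in JSON."], params):
    print(out.outputs[0].text)  # e.g. {"answer": 4}
```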
### API Changes
- Exposed `kv_cache_retention_config` from the C++ `executor` API to the LLM API.
- Moved `BuildConfig` arguments to `LlmArgs`; a sketch follows this list.
- Removed speculative decoding parameters from stateful decoders.
- Exposed `DecoderState` via bindings and integrated it into the decoder.
- Refactored `LlmArgs` with Pydantic and migrated the remaining pybinding configurations to Python.
- Refactored the disaggregated serving scripts.
- Added `numNodes` to `ParallelConfig`.
- Redesigned the multi-stream API for DeepSeek.
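To make the `BuildConfig`-to-`LlmArgs` move concrete, here is a minimal sketch. It assumes that build-related knobs such as `max_batch_size` and `max_num_tokens` are now accepted directly as `LlmArgs` fields on the `LLM` constructor; treat the exact field names as release-dependent and verify them against `llm_args.py`.

```python
from tensorrt_llm import LLM

# Assumption: build-time limits formerly set on BuildConfig are now plain
# keyword arguments, validated by the Pydantic-based LlmArgs model.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B",
    max_batch_size=8,     # formerly BuildConfig.max_batch_size
    max_num_tokens=4096,  # formerly BuildConfig.max_num_tokens
)
```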
### Fixed Issues
- Fixed a misused length argument of `PluginField`. Thanks to the contribution from @jl749 in #2712. This also fixes #2685.
- Fixed a Llama-3.2 SmoothQuant convert-checkpoint issue. (#2677)
- Fixed a bug when loading an engine that uses LoRA through the LLM API. (#2782)
- Fixed incorrect batch-slot usage in the `addCumLogProbs` kernel. Thanks to the contribution from @aotman in #2787.
- Fixed incorrect output for Llama-3.2-11B-Vision-Instruct. (#2796)
- Removed the need for `--extra-index-url https://pypi.nvidia.com` when running `pip install tensorrt-llm`.
### Infrastructure Changes
- The dependent NVIDIA ModelOpt version was updated to 0.27.

### Known Issues
- The PyTorch workflow on SBSA is incompatible with bare-metal environments such as Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
## TensorRT-LLM Release 0.18.2

### Key Features and Enhancements
- This update addresses known security issues. For the latest NVIDIA Vulnerability Disclosure Information, visit https://www.nvidia.com/en-us/security/.


## TensorRT-LLM Release 0.18.1

### Key Features and Enhancements
@@ -65,7 +186,7 @@ All published functionality in the Release Notes has been fully tested and verif
### Known Issues
- Need `--extra-index-url https://pypi.nvidia.com` when running `pip install tensorrt-llm` due to new third-party dependencies.
- The PYPI SBSA wheel is incompatible with PyTorch 2.5.1 due to a break in the PyTorch ABI/API, as detailed in the related [GitHub issue](https://github.com/pytorch/pytorch/issues/144966).
- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container (https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.

### Fixed Issues
- Fixed incorrect LoRA output dimension. Thanks to the contribution from @akhoroshev in #2484.
@@ -41,7 +41,7 @@ scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --ex
- [Architecture Overview](./torch/arch_overview.md)
- [Adding a New Model](./torch/adding_new_model.md)
- [Examples](../../examples/pytorch/README.md)
- [Examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/pytorch/README.md)

## Key Components