[TRTLLM-9160][doc] add doc to llm_runtime.py (#9482)
Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
parent 6332bf27e6
commit 3e442922a3
@@ -1,6 +1,50 @@
### :title Runtime Configuration Examples
### :order 6
### :section Customization
'''
This script demonstrates various runtime configuration options in TensorRT-LLM,
including KV cache management and CUDA graph optimizations.

**KV Cache Configuration:**

The KV cache (key-value cache) stores attention keys and values during inference,
which is crucial for efficient autoregressive generation. Proper KV cache configuration helps with:

1. **Memory Management**: Control GPU memory allocation for the key-value cache through
   `free_gpu_memory_fraction`, balancing memory between model weights and cache storage.

2. **Block Reuse Optimization**: Enable `enable_block_reuse` to optimize memory usage
   for shared prefixes across multiple requests, improving throughput for common prompts.

3. **Performance Tuning**: Configure cache block sizes and total capacity to match
   your workload characteristics (batch size, sequence length, and request patterns).

Please refer to the `KvCacheConfig` API reference for more details.
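
As a minimal sketch of these two options in use (the `tensorrt_llm.llmapi` import path and the
`generate` call are assumptions here; the full version lives in `example_kv_cache_config()` below):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig  # assumed import path

# Give the KV cache half of the free GPU memory and reuse cached blocks
# for requests that share a common prompt prefix.
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.5,
    enable_block_reuse=True,
)

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    kv_cache_config=kv_cache_config,
)
for output in llm.generate(["Hello, my name is"]):
    print(output.outputs[0].text)
```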

**CUDA Graph Configuration:**

CUDA graphs help reduce kernel launch overhead and improve GPU utilization by capturing
and replaying GPU operations. Benefits include:

- Reduced kernel launch overhead for repeated operations
- Better GPU utilization through optimized execution
- Improved throughput for inference workloads

Please refer to the `CudaGraphConfig` API reference for more details.
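
A rough sketch of wiring this into the `LLM` constructor (the specific `CudaGraphConfig` fields
`batch_sizes` and `enable_padding` used here are assumptions; consult the API reference for the
authoritative options):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig  # assumed import path

# Capture graphs for a few common batch sizes; padding lets smaller batches
# reuse the nearest captured graph instead of falling back to eager mode.
cuda_graph_config = CudaGraphConfig(
    batch_sizes=[1, 2, 4],
    enable_padding=True,
)

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_batch_size=4,
    cuda_graph_config=cuda_graph_config,
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)
```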

**How to Run:**

Run all examples:
```bash
python llm_runtime.py
```

Run a specific example:
```bash
python llm_runtime.py --example kv_cache
python llm_runtime.py --example cuda_graph
```
'''

import argparse
@@ -48,15 +92,19 @@ def example_cuda_graph_config():


def example_kv_cache_config():
    """Example demonstrating KV cache configuration for memory management and performance."""
    print("\n=== KV Cache Configuration Example ===")
    print("\n1. KV Cache Configuration:")

-    llm_advanced = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                       max_batch_size=8,
-                       max_seq_len=1024,
-                       kv_cache_config=KvCacheConfig(
-                           free_gpu_memory_fraction=0.5,
-                           enable_block_reuse=True))
+    llm_advanced = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_batch_size=8,
+        max_seq_len=1024,
+        kv_cache_config=KvCacheConfig(
+            # free_gpu_memory_fraction: the fraction of free GPU memory to allocate to the KV cache
+            free_gpu_memory_fraction=0.5,
+            # enable_block_reuse: whether to enable block reuse
+            enable_block_reuse=True))

    prompts = [
        "Hello, my name is",