[docs]
@@ -2641,17 +2755,17 @@
     cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
     if self.cuda_graph_config else
     CudaGraphConfig.model_fields['max_batch_size'].default,
-    cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
+    cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
     if self.cuda_graph_config else
-    CudaGraphConfig.model_fields['padding_enabled'].default,
+    CudaGraphConfig.model_fields['enable_padding'].default,
     disable_overlap_scheduler=self.disable_overlap_scheduler,
-    moe_max_num_tokens=self.moe_max_num_tokens,
-    moe_load_balancer=self.moe_load_balancer,
+    moe_max_num_tokens=self.moe_config.max_num_tokens,
+    moe_load_balancer=self.moe_config.load_balancer,
     attn_backend=self.attn_backend,
-    moe_backend=self.moe_backend,
+    moe_backend=self.moe_config.backend,
     enable_mixed_sampler=self.enable_mixed_sampler,
     enable_trtllm_sampler=self.enable_trtllm_sampler,
-    kv_cache_dtype=self.kv_cache_dtype,
+    kv_cache_dtype=self.kv_cache_config.dtype,
     enable_iter_perf_stats=self.enable_iter_perf_stats,
     enable_iter_req_stats=self.enable_iter_req_stats,
     print_iter_log=self.print_iter_log,
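The hunk above reflects the move from flat LLM-args fields to nested config objects (`cuda_graph_config`, `moe_config`, `kv_cache_config`). Below is a minimal, hedged sketch of what user-facing code might look like against the renamed fields; the import path, backend name, and field values are assumptions based on the surrounding code, not taken from the diff.

```python
# Hedged sketch: exercising the nested configs whose fields the code above now
# reads (enable_padding, max_num_tokens, backend, dtype). Import path and the
# concrete values shown are assumptions.
from tensorrt_llm.llmapi import LLM, CudaGraphConfig, KvCacheConfig, MoeConfig

llm = LLM(
    model="/path/to/model",  # placeholder path
    cuda_graph_config=CudaGraphConfig(enable_padding=True, max_batch_size=64),
    moe_config=MoeConfig(backend="CUTLASS", max_num_tokens=8192),
    kv_cache_config=KvCacheConfig(dtype="auto"),
)
```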
@@ -2693,10 +2807,12 @@
         "enable_build_cache": BuildCacheConfig,
         "speculative_config": DecodingBaseConfig,
         "lora_config": LoraConfig,
+        "moe_config": MoeConfig,
     }
     for field_name, field_type in field_mapping.items():
         if field_name in llm_args_dict:
-            if field_name == "speculative_config":
+            # Some fields need to be converted manually.
+            if field_name in ["speculative_config", "build_config"]:
                 llm_args_dict[field_name] = field_type.from_dict(
                     llm_args_dict[field_name])
             else:
@@ -2719,7 +2835,8 @@
return llm_args
-
def get_model_format(model_dir: str) -> _ModelFormatKind:
+
def get_model_format(model_dir: str,
+
trust_remote_code: bool = False) -> _ModelFormatKind:
''' Get the format of the model. '''
if not (Path(model_dir) / 'config.json').exists():
raise ValueError(
@@ -2738,7 +2855,8 @@
             PretrainedConfig.from_checkpoint(model_dir)
         else:
             model_format = _ModelFormatKind.HF
-            AutoConfig.from_hugging_face(model_dir)
+            AutoConfig.from_hugging_face(model_dir,
+                                         trust_remote_code=trust_remote_code)
     except Exception as e:
         raise ValueError(
             f"Inferred model format {model_format}, but failed to load config.json: {e}"
[Diffs of the other generated HTML module pages under `latest/_modules/` are omitted: each page's navigation sidebar gains the same two entries, "Runtime Configuration Examples" and "Sampling Techniques Showcase", with no other recoverable changes.]
diff --git a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
index 1dc785a2ff..4ac120b8c0 100644
--- a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
@@ -342,6 +342,8 @@
 Control generated text using logits processor
 Generate text with multiple LoRA adapters
 Speculative Decoding
+Runtime Configuration Examples
+Sampling Techniques Showcase
 Run LLM-API with pytorch backend on Slurm
 Run trtllm-bench with pytorch backend on Slurm
 Run trtllm-serve with pytorch backend on Slurm
@@ -377,6 +379,8 @@
 Command-Line Reference
@@ -3217,7 +3221,7 @@
                 )
                 image = None
             elif self.model_type in ['llava_onevision']:
-                pre_prompt = "<|im_start|>user "
+                pre_prompt = "<|im_start|>user " + "<video>" if self.args.video_path is not None else "<image>"
                 if input_text is None:
                     input_text = "Question: which city is this? Answer:" if self.args.video_path is None else "Why is this video funny?"
                 post_prompt = f"\n{input_text}<|im_end|><|im_start|>assistant\n"
@@ -3228,7 +3232,7 @@
                     text=prompt,
                     return_tensors="pt")
             else:
-                image = self.processor(videos=raw_image,
+                image = self.processor(videos=list(raw_image),
                                        text=prompt,
                                        return_tensors="pt")
diff --git a/latest/_sources/_cpp_gen/executor.rst.txt b/latest/_sources/_cpp_gen/executor.rst.txt
index d3ca9cd473..6aa6d178e5 100644
--- a/latest/_sources/_cpp_gen/executor.rst.txt
+++ b/latest/_sources/_cpp_gen/executor.rst.txt
@@ -4,12 +4,30 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
+cacheCommunicator.h
+___________________
+
+.. doxygenfile:: cacheCommunicator.h
+ :project: TensorRT-LLM
+
+serialization.h
+_______________
+
+.. doxygenfile:: serialization.h
+ :project: TensorRT-LLM
+
disaggServerUtil.h
__________________
.. doxygenfile:: disaggServerUtil.h
:project: TensorRT-LLM
+dataTransceiverState.h
+______________________
+
+.. doxygenfile:: dataTransceiverState.h
+ :project: TensorRT-LLM
+
tensor.h
________
@@ -22,10 +40,10 @@ _______________
.. doxygenfile:: transferAgent.h
:project: TensorRT-LLM
-serialization.h
-_______________
+executor.h
+__________
-.. doxygenfile:: serialization.h
+.. doxygenfile:: executor.h
:project: TensorRT-LLM
types.h
@@ -34,21 +52,3 @@ _______
.. doxygenfile:: types.h
:project: TensorRT-LLM
-executor.h
-__________
-
-.. doxygenfile:: executor.h
- :project: TensorRT-LLM
-
-dataTransceiverState.h
-______________________
-
-.. doxygenfile:: dataTransceiverState.h
- :project: TensorRT-LLM
-
-cacheCommunicator.h
-___________________
-
-.. doxygenfile:: cacheCommunicator.h
- :project: TensorRT-LLM
-
diff --git a/latest/_sources/_cpp_gen/runtime.rst.txt b/latest/_sources/_cpp_gen/runtime.rst.txt
index 076debe93b..721113caf1 100644
--- a/latest/_sources/_cpp_gen/runtime.rst.txt
+++ b/latest/_sources/_cpp_gen/runtime.rst.txt
@@ -4,58 +4,22 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
-lookaheadBuffers.h
-__________________
-
-.. doxygenfile:: lookaheadBuffers.h
- :project: TensorRT-LLM
-
lookaheadModule.h
_________________
.. doxygenfile:: lookaheadModule.h
:project: TensorRT-LLM
-iBuffer.h
+decoderState.h
+______________
+
+.. doxygenfile:: decoderState.h
+ :project: TensorRT-LLM
+
+request.h
_________
-.. doxygenfile:: iBuffer.h
- :project: TensorRT-LLM
-
-modelConfig.h
-_____________
-
-.. doxygenfile:: modelConfig.h
- :project: TensorRT-LLM
-
-decodingOutput.h
-________________
-
-.. doxygenfile:: decodingOutput.h
- :project: TensorRT-LLM
-
-promptTuningParams.h
-____________________
-
-.. doxygenfile:: promptTuningParams.h
- :project: TensorRT-LLM
-
-bufferManager.h
-_______________
-
-.. doxygenfile:: bufferManager.h
- :project: TensorRT-LLM
-
-gptJsonConfig.h
-_______________
-
-.. doxygenfile:: gptJsonConfig.h
- :project: TensorRT-LLM
-
-runtimeDefaults.h
-_________________
-
-.. doxygenfile:: runtimeDefaults.h
+.. doxygenfile:: request.h
:project: TensorRT-LLM
loraCache.h
@@ -64,10 +28,34 @@ ___________
.. doxygenfile:: loraCache.h
:project: TensorRT-LLM
-rawEngine.h
-___________
+bufferManager.h
+_______________
-.. doxygenfile:: rawEngine.h
+.. doxygenfile:: bufferManager.h
+ :project: TensorRT-LLM
+
+memoryCounters.h
+________________
+
+.. doxygenfile:: memoryCounters.h
+ :project: TensorRT-LLM
+
+runtimeDefaults.h
+_________________
+
+.. doxygenfile:: runtimeDefaults.h
+ :project: TensorRT-LLM
+
+ipcUtils.h
+__________
+
+.. doxygenfile:: ipcUtils.h
+ :project: TensorRT-LLM
+
+tllmLogger.h
+____________
+
+.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM
gptDecoder.h
@@ -76,34 +64,16 @@ ____________
.. doxygenfile:: gptDecoder.h
:project: TensorRT-LLM
-eagleBuffers.h
-______________
+cudaEvent.h
+___________
-.. doxygenfile:: eagleBuffers.h
+.. doxygenfile:: cudaEvent.h
:project: TensorRT-LLM
-medusaModule.h
-______________
+modelConfig.h
+_____________
-.. doxygenfile:: medusaModule.h
- :project: TensorRT-LLM
-
-explicitDraftTokensBuffers.h
-____________________________
-
-.. doxygenfile:: explicitDraftTokensBuffers.h
- :project: TensorRT-LLM
-
-iTensor.h
-_________
-
-.. doxygenfile:: iTensor.h
- :project: TensorRT-LLM
-
-common.h
-________
-
-.. doxygenfile:: common.h
+.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM
loraCachePageManagerConfig.h
@@ -118,22 +88,10 @@ _____________
.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
-loraModule.h
-____________
+eagleModule.h
+_____________
-.. doxygenfile:: loraModule.h
- :project: TensorRT-LLM
-
-speculativeDecodingMode.h
-_________________________
-
-.. doxygenfile:: speculativeDecodingMode.h
- :project: TensorRT-LLM
-
-cudaEvent.h
-___________
-
-.. doxygenfile:: cudaEvent.h
+.. doxygenfile:: eagleModule.h
:project: TensorRT-LLM
decodingInput.h
@@ -142,40 +100,10 @@ _______________
.. doxygenfile:: decodingInput.h
:project: TensorRT-LLM
-speculativeDecodingModule.h
-___________________________
+gptJsonConfig.h
+_______________
-.. doxygenfile:: speculativeDecodingModule.h
- :project: TensorRT-LLM
-
-iGptDecoderBatched.h
-____________________
-
-.. doxygenfile:: iGptDecoderBatched.h
- :project: TensorRT-LLM
-
-eagleModule.h
-_____________
-
-.. doxygenfile:: eagleModule.h
- :project: TensorRT-LLM
-
-tllmLogger.h
-____________
-
-.. doxygenfile:: tllmLogger.h
- :project: TensorRT-LLM
-
-gptDecoderBatched.h
-___________________
-
-.. doxygenfile:: gptDecoderBatched.h
- :project: TensorRT-LLM
-
-cudaStream.h
-____________
-
-.. doxygenfile:: cudaStream.h
+.. doxygenfile:: gptJsonConfig.h
:project: TensorRT-LLM
ipcNvlsMemory.h
@@ -190,27 +118,99 @@ ________________
.. doxygenfile:: samplingConfig.h
:project: TensorRT-LLM
-request.h
-_________
+gptDecoderBatched.h
+___________________
-.. doxygenfile:: request.h
+.. doxygenfile:: gptDecoderBatched.h
:project: TensorRT-LLM
-decoderState.h
+lookaheadBuffers.h
+__________________
+
+.. doxygenfile:: lookaheadBuffers.h
+ :project: TensorRT-LLM
+
+loraModule.h
+____________
+
+.. doxygenfile:: loraModule.h
+ :project: TensorRT-LLM
+
+promptTuningParams.h
+____________________
+
+.. doxygenfile:: promptTuningParams.h
+ :project: TensorRT-LLM
+
+speculativeDecodingMode.h
+_________________________
+
+.. doxygenfile:: speculativeDecodingMode.h
+ :project: TensorRT-LLM
+
+common.h
+________
+
+.. doxygenfile:: common.h
+ :project: TensorRT-LLM
+
+medusaModule.h
______________
-.. doxygenfile:: decoderState.h
+.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM
-ipcUtils.h
-__________
-
-.. doxygenfile:: ipcUtils.h
- :project: TensorRT-LLM
-
-memoryCounters.h
+decodingOutput.h
________________
-.. doxygenfile:: memoryCounters.h
+.. doxygenfile:: decodingOutput.h
+ :project: TensorRT-LLM
+
+cudaStream.h
+____________
+
+.. doxygenfile:: cudaStream.h
+ :project: TensorRT-LLM
+
+eagleBuffers.h
+______________
+
+.. doxygenfile:: eagleBuffers.h
+ :project: TensorRT-LLM
+
+iGptDecoderBatched.h
+____________________
+
+.. doxygenfile:: iGptDecoderBatched.h
+ :project: TensorRT-LLM
+
+speculativeDecodingModule.h
+___________________________
+
+.. doxygenfile:: speculativeDecodingModule.h
+ :project: TensorRT-LLM
+
+explicitDraftTokensBuffers.h
+____________________________
+
+.. doxygenfile:: explicitDraftTokensBuffers.h
+ :project: TensorRT-LLM
+
+rawEngine.h
+___________
+
+.. doxygenfile:: rawEngine.h
+ :project: TensorRT-LLM
+
+iTensor.h
+_________
+
+.. doxygenfile:: iTensor.h
+ :project: TensorRT-LLM
+
+iBuffer.h
+_________
+
+.. doxygenfile:: iBuffer.h
:project: TensorRT-LLM
diff --git a/latest/_sources/advanced/disaggregated-service.md.txt b/latest/_sources/advanced/disaggregated-service.md.txt
index 757b1da81f..426d327c18 100644
--- a/latest/_sources/advanced/disaggregated-service.md.txt
+++ b/latest/_sources/advanced/disaggregated-service.md.txt
@@ -16,8 +16,6 @@ An [architectural and performance overview](../../../docs/source/blogs/tech_blog
TRT-LLM uses some environment variables to control the behavior of disaggregated service.
-* `TRTLLM_USE_UCX_KVCACHE`: Specifies whether to use UCX for KV cache transfer. The default value is `0`. This must be enabled when using a disaggregated service.
-
* `TRTLLM_PARALLEL_CACHE_SEND`: If set to `1`, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is `0`.
* `TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP`: If set to `1`, generationExecutor will not overlap KV cache transfer with model inference. The default value is `0`.
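As an illustration only, these knobs can be set in the environment before any executor process starts; the variable names come from the list above, and the values shown are assumptions.

```python
# Hedged sketch: setting the cache-transfer knobs described above before the
# disaggregated executors are launched.
import os

os.environ["TRTLLM_PARALLEL_CACHE_SEND"] = "1"                # send KV cache for multiple requests in parallel
os.environ["TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP"] = "0"  # keep transfer/inference overlap enabled (default)
```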
@@ -66,55 +64,19 @@ A. Yes, it's recommended that different executors use different GPUs. We support
*Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`*
-A. Please set the environment variables
-```
-export TRTLLM_USE_UCX_KVCACHE=1
+A. Please set the `backendType` of `CacheTransceiverConfig`.
+```cpp
+ExecutorConfig executorConfig{...};
+
+executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT));
```
-*Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*
-
-A. Please check version of `UCX` with `ucx_info -v`.
-If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For BlackWell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
-If the version of UCX >=1.18, there are several ways to enable NVLink:
-1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`,`UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
-2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
+When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will transfer the KV cache using `CUDA-aware MPI`. All executor processes involved must share the same MPI world communicator. Consequently, with `TRTLLM_USE_MPI_KVCACHE=1`, TRT-LLM only supports launching multiple executors via `MPI`. Additionally, the `CommunicationMode` for the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for the `disaggregated-service`. These restrictions do not apply when `TRTLLM_USE_UCX_KVCACHE=1` is set.
*Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?*
-A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:
-1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`,`UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
-2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
+A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.
-*Q. Are there any guidelines for performance tuning of KV cache transfer?*
+*Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?*
-A. Depending on the user's use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
-
-Environment Variable Set A
-
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_RNDV_FRAG_MEM_TYPES=cuda
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
-
-Environment Variable Set B
-
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
-export UCX_CUDA_COPY_DMABUF=no
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-Set B may provide slightly better performance on a single node compared to Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
-
-Environment Variable Set C
-
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
+A. Connections for kvCache transfer between executors are established dynamically. Connection establishment incurs significant overhead, which explains the lower kvCache transfer bandwidth observed for the first requests after service startup: the measured bandwidth includes connection setup. When benchmarking, it is recommended to run a warm-up phase to obtain accurate performance measurements.
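For example, a benchmark driver might time only the steady-state portion; `client` and `generate` below are stand-ins, not a TensorRT-LLM API.

```python
# Hedged sketch: warm-up requests absorb the dynamic connection setup so that
# only steady-state requests are timed. `client.generate` is a stand-in for
# whatever request function the benchmark actually uses.
import time

def timed_benchmark(client, prompts, warmup=8):
    for p in prompts[:warmup]:            # not timed: establishes connections
        client.generate(p)
    start = time.perf_counter()
    for p in prompts[warmup:]:
        client.generate(p)
    return time.perf_counter() - start
```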
diff --git a/latest/_sources/advanced/speculative-decoding.md.txt b/latest/_sources/advanced/speculative-decoding.md.txt
index a601d9dd24..85a87ae062 100644
--- a/latest/_sources/advanced/speculative-decoding.md.txt
+++ b/latest/_sources/advanced/speculative-decoding.md.txt
@@ -3,13 +3,14 @@
- [About Speculative Sampling](#about-speculative-sampling)
- [Performance Improvements](#Performance-improvements)
- [Draft-Target-Model](#Draft-Target-Model)
-- [Prompt-Lookup-Decoding](#prompt-lookup-decoding)
+- [NGram](#ngram)
- [Medusa](#medusa)
- [Medusa Tree](#medusa-tree)
- [Using Medusa with TensorRT-LLM](#using-medusa-with-tensorrt-llm)
- [Limitations](#limitations)
- [ReDrafter](#redrafter)
- [EAGLE](#eagle)
+ - [Disaggregated Serving](#disaggregated-serving)
- [Lookahead decoding](#lookahead-decoding)
## About Speculative Sampling
@@ -35,7 +36,7 @@ TensorRT-LLM supports several approaches for generating draft tokens, including:
1. [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774).
2. [Recurrent Drafter for Fast Speculative Decoding in Large Language Models](https://arxiv.org/html/2403.09919v1).
3. [EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty](https://arxiv.org/pdf/2401.15077).
-3. Utilizing prompt tokens as draft tokens. For more information, refer to [Prompt Lookup Decoding](https://github.com/apoorvumang/prompt-lookup-decoding/).
+3. Utilizing prompt tokens as draft tokens. For more information, refer to [NGram](https://github.com/apoorvumang/prompt-lookup-decoding/).
4. Utilizing Jacobi-like decoding to predict and verify draft tokens using the same model which does not need additional fine-tuning. Refer to [Break the Sequential Dependency of LLM Inference Using Lookahead Decoding](https://arxiv.org/pdf/2402.02057).
@@ -61,13 +62,13 @@ Subsequently, the prompt, now updated with the accepted tokens, is sent back to
This iterative process continues until predefined stop conditions are met.
An example of this orchestration process can be found in the [TensorRT-LLM Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py).
-We provide two styles of running Draft-Target-Model now: using TensorRT-LLM-BLS in Triton Inference Server, or using TensorRT-LLM directly. Detailed steps of running can be found in [examples/draft_target_model/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/draft_target_model/README.md) and the code can be found in [examples/prompt_lookup/run_dtm_pld.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py).
+We provide two styles of running Draft-Target-Model now: using TensorRT-LLM-BLS in Triton Inference Server, or using TensorRT-LLM directly. Detailed steps of running can be found in [examples/draft_target_model/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/draft_target_model/README.md) and the code can be found in [examples/ngram/run_dtm_ngram.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/run_dtm_ngram.py).
-## Prompt-Lookup-Decoding
+## NGram
-The Prompt-Lookup speculative decoding directly copies from the input prompt and previous generated output as draft tokens while generating the later output. It works like Draft-Target-Model but involves only one Target LLM model without further fine-tuning. The Prompt-Lookup profit from the scenarios which have high n-gram overlap between input prompt and output, such as summarization, document QA, multi-turn chat, code editing, etc.
+NGram speculative decoding directly copies tokens from the input prompt and previously generated output as draft tokens while generating later output. It works like Draft-Target-Model but involves only one Target LLM without further fine-tuning. NGram profits in scenarios with high n-gram overlap between the input prompt and the output, such as summarization, document QA, multi-turn chat, and code editing.
-See document in [examples/prompt_lookup/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/README.md) and the code can be found in [examples/prompt_lookup/run_dtm_pld.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py).
+See the documentation in [examples/ngram/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/README.md); the code can be found in [examples/ngram/run_dtm_ngram.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/run_dtm_ngram.py).
## Medusa
@@ -169,6 +170,10 @@ The EAGLE approach enhances the single-model Medusa method by predicting and ver
Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft token acceptance, and draft token generation are performed inside the TensorRT engine. EAGLE-1 and EAGLE-2 are both supported, while EAGLE-2 is currently in the experimental stage. Please visit the [EAGLE README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md) for information about building and running the model.
+### Disaggregated Serving
+
+[Disaggregated Serving](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) with EAGLE3 using the two-model approach is supported in the PyTorch backend. Refer to the following [Dynamo example](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/llama4_plus_eagle.md) for how to run EAGLE3 with Disaggregated Serving for Llama 4 Maverick.
+
## Lookahead Decoding
The lookahead decoding algorithm operates through two parallel computation branches within the same model: a lookahead branch that generates n-grams using a fixed-size 2D window, and a verification branch that validates promising n-gram candidates. This approach eliminates the need for additional model training or fine-tuning and can be enabled for any autoregressive model. Refer to the [Lookahead decoding README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/lookahead/README.md) for information about building and running the model.
diff --git a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
index f17caefc44..98c72e700d 100644
--- a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
+++ b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
@@ -138,7 +138,8 @@ YOUR_DATA_PATH=
cat > ./extra-llm-api-config.yml <<EOF
cat > ./extra_llm_api_options_eplb.yaml <<EOF
+The ``prepare_dataset`` script generates benchmark datasets in the required format. The prepare_dataset script supports:
+
+**Dataset Types:**
+
+- Real datasets from various sources
+- Synthetic datasets with normal or uniform token distributions
+- LoRA task-specific datasets
+
+**Key Features:**
+
+- Tokenizer integration for proper text preprocessing
+- Configurable random seeds for reproducible results
+- Support for LoRA adapters and task IDs
+- Output in JSON format compatible with trtllm-bench
+
+.. important::
+ The ``--stdout`` flag is **required** when using prepare_dataset.py with trtllm-bench to ensure proper data streaming format.
+
+**Usage:**
+
+prepare_dataset
+-------------------
+
+.. code-block:: bash
+
+ python prepare_dataset.py [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - ``--tokenizer``
+ - Tokenizer directory or HuggingFace model name (required)
+ * - ``--output``
+ - Output JSON filename (default: preprocessed_dataset.json)
+ * - ``--stdout``
+ - Print output to stdout with JSON dataset entry on each line (**required for trtllm-bench**)
+ * - ``--random-seed``
+ - Random seed for token generation (default: 420)
+ * - ``--task-id``
+ - LoRA task ID (default: -1)
+ * - ``--rand-task-id``
+ - Random LoRA task range (two integers)
+ * - ``--lora-dir``
+ - Directory containing LoRA adapters
+ * - ``--log-level``
+ - Logging level: info or debug (default: info)
+
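+**Example:**
+
+A minimal end-to-end sketch, assuming the global options are given before the subcommand; the tokenizer path, model name, and file names below are illustrative placeholders:
+
+.. code-block:: bash
+
+   # Generate a synthetic dataset and capture the streamed JSON entries in a file
+   python prepare_dataset.py \
+       --tokenizer /path/to/tokenizer \
+       --stdout \
+       token_norm_dist \
+       --num-requests 100 \
+       --input-mean 128 --input-stdev 0 \
+       --output-mean 128 --output-stdev 0 \
+       > dataset.txt
+
+   # The file can then be passed to trtllm-bench via --dataset
+   trtllm-bench --model $model_name throughput --dataset dataset.txt
+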
+dataset
+-------------------
+
+Process real datasets from various sources.
+
+.. code-block:: bash
+
+ python prepare_dataset.py dataset [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - ``--input``
+ - Input dataset file or directory (required)
+ * - ``--max-input-length``
+ - Maximum input sequence length (default: 2048)
+ * - ``--max-output-length``
+ - Maximum output sequence length (default: 512)
+ * - ``--num-samples``
+ - Number of samples to process (default: all)
+ * - ``--format``
+ - Input format: json, jsonl, csv, or txt (default: auto-detect)
+
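+**Example:**
+
+An illustrative sketch for converting an existing dataset file; the input path and the limits shown are placeholder values:
+
+.. code-block:: bash
+
+   python prepare_dataset.py \
+       --tokenizer /path/to/tokenizer \
+       --stdout \
+       dataset \
+       --input /path/to/input_data.jsonl \
+       --max-input-length 2048 \
+       --max-output-length 512 \
+       --num-samples 1000 \
+       > real_dataset.txt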
+
+token_norm_dist
+-------------------
+
+Generate synthetic datasets with normal token distribution.
+
+.. code-block:: bash
+
+ python prepare_dataset.py token_norm_dist [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - ``--num-requests``
+ - Number of requests to be generated (required)
+ * - ``--input-mean``
+ - Normal distribution mean for input tokens (required)
+ * - ``--input-stdev``
+ - Normal distribution standard deviation for input tokens (required)
+ * - ``--output-mean``
+ - Normal distribution mean for output tokens (required)
+ * - ``--output-stdev``
+ - Normal distribution standard deviation for output tokens (required)
+
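+**Example:**
+
+An illustrative sketch with non-zero standard deviations so that request lengths vary; all values are placeholders:
+
+.. code-block:: bash
+
+   python prepare_dataset.py \
+       --tokenizer /path/to/tokenizer \
+       --stdout \
+       token_norm_dist \
+       --num-requests 200 \
+       --input-mean 512 --input-stdev 64 \
+       --output-mean 128 --output-stdev 32 \
+       > norm_dist_dataset.txt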
+
+token_unif_dist
+-------------------
+
+Generate synthetic datasets with uniform token distribution.
+
+.. code-block:: bash
+
+ python prepare_dataset.py token_unif_dist [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+ :widths: 20 80
+ :header-rows: 1
+
+ * - Option
+ - Description
+ * - ``--num-requests``
+ - Number of requests to be generated (required)
+ * - ``--input-min``
+ - Uniform distribution minimum for input tokens (required)
+ * - ``--input-max``
+ - Uniform distribution maximum for input tokens (required)
+ * - ``--output-min``
+ - Uniform distribution minimum for output tokens (required)
+ * - ``--output-max``
+ - Uniform distribution maximum for output tokens (required)
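+
+**Example:**
+
+An illustrative sketch; the token ranges shown are placeholder values:
+
+.. code-block:: bash
+
+   python prepare_dataset.py \
+       --tokenizer /path/to/tokenizer \
+       --stdout \
+       token_unif_dist \
+       --num-requests 200 \
+       --input-min 64 --input-max 256 \
+       --output-min 64 --output-max 256 \
+       > unif_dist_dataset.txt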
diff --git a/latest/_sources/examples/llm_api_examples.rst.txt b/latest/_sources/examples/llm_api_examples.rst.txt
index 8af8031907..969f88456b 100644
--- a/latest/_sources/examples/llm_api_examples.rst.txt
+++ b/latest/_sources/examples/llm_api_examples.rst.txt
@@ -22,6 +22,8 @@ _____________
llm_logits_processor
llm_multilora
llm_speculative_decoding
+ llm_runtime
+ llm_sampling
Slurm
_____
diff --git a/latest/_sources/examples/llm_guided_decoding.rst.txt b/latest/_sources/examples/llm_guided_decoding.rst.txt
index c743db7c46..8f41b34b24 100644
--- a/latest/_sources/examples/llm_guided_decoding.rst.txt
+++ b/latest/_sources/examples/llm_guided_decoding.rst.txt
@@ -3,6 +3,6 @@ Generate text with guided decoding
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_guided_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py
- :lines: 4-50
+ :lines: 4-47
:language: python
:linenos:
diff --git a/latest/_sources/examples/llm_runtime.rst.txt b/latest/_sources/examples/llm_runtime.rst.txt
new file mode 100644
index 0000000000..163be13f79
--- /dev/null
+++ b/latest/_sources/examples/llm_runtime.rst.txt
@@ -0,0 +1,8 @@
+Runtime Configuration Examples
+==============================
+Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_runtime.py.
+
+.. literalinclude:: ../../../examples/llm-api/llm_runtime.py
+ :lines: 4-97
+ :language: python
+ :linenos:
diff --git a/latest/_sources/examples/llm_sampling.rst.txt b/latest/_sources/examples/llm_sampling.rst.txt
new file mode 100644
index 0000000000..75dd5f8d81
--- /dev/null
+++ b/latest/_sources/examples/llm_sampling.rst.txt
@@ -0,0 +1,8 @@
+Sampling Techniques Showcase
+============================
+Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_sampling.py.
+
+.. literalinclude:: ../../../examples/llm-api/llm_sampling.py
+ :lines: 4-229
+ :language: python
+ :linenos:
diff --git a/latest/_sources/index.rst.txt b/latest/_sources/index.rst.txt
index b63ec95a67..50b9c12267 100644
--- a/latest/_sources/index.rst.txt
+++ b/latest/_sources/index.rst.txt
@@ -77,6 +77,7 @@ Welcome to TensorRT-LLM's Documentation!
:caption: Command-Line Reference
:hidden:
+ commands/trtllm-bench
commands/trtllm-build
commands/trtllm-serve
diff --git a/latest/_sources/llm-api/reference.rst.txt b/latest/_sources/llm-api/reference.rst.txt
index 825c662c66..cdef5974e9 100644
--- a/latest/_sources/llm-api/reference.rst.txt
+++ b/latest/_sources/llm-api/reference.rst.txt
@@ -55,6 +55,12 @@ API Reference
:show-inheritance:
:special-members: __init__
+.. autoclass:: tensorrt_llm.llmapi.MoeConfig
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members: __init__
+
.. autoclass:: tensorrt_llm.llmapi.LookaheadDecodingConfig
:members:
:undoc-members:
diff --git a/latest/_sources/performance/perf-overview.md.txt b/latest/_sources/performance/perf-overview.md.txt
index 05c4918db5..3f55a4e109 100644
--- a/latest/_sources/performance/perf-overview.md.txt
+++ b/latest/_sources/performance/perf-overview.md.txt
@@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
`llm_options.yml`
```yaml
cuda_graph_config:
- padding_enabled: true
+ enable_padding: true
batch_sizes:
- 1
- 2
diff --git a/latest/_sources/reference/ci-overview.md.txt b/latest/_sources/reference/ci-overview.md.txt
index 9002ae6ab3..30cc613a2e 100644
--- a/latest/_sources/reference/ci-overview.md.txt
+++ b/latest/_sources/reference/ci-overview.md.txt
@@ -55,9 +55,27 @@ The array elements are: GPU type, YAML file (without extension), shard index, an
2. Search `jenkins/L0_Test.groovy` for a stage whose YAML file matches (for example `l0_a100`) and whose name contains `[Post-Merge]` if the YAML entry uses `stage: post_merge`.
3. The resulting stage name(s) are what you pass to Jenkins via the `stage_list` parameter when triggering a job.
-### Example
+### Using `test_to_stage_mapping.py`
+
+Manually searching YAML and Groovy files can be tedious. The helper script
+`scripts/test_to_stage_mapping.py` automates the lookup:
+
+```bash
+python scripts/test_to_stage_mapping.py --tests "triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]"
+python scripts/test_to_stage_mapping.py --tests gpt_ib_ptuning
+python scripts/test_to_stage_mapping.py --stages A100X-Triton-Post-Merge-1
+python scripts/test_to_stage_mapping.py --test-list my_tests.txt
+python scripts/test_to_stage_mapping.py --test-list my_tests.yml
+```
+
+The first two commands print the Jenkins stages that run the specified tests or
+patterns. Patterns are matched by substring, so partial test names are
+supported out of the box. The third lists every test executed in the given stage. When
+providing tests on the command line, quote each test string so the shell does
+not interpret the `[` and `]` characters as globs. Alternatively, store the
+tests in a newline‑separated text file or a YAML list and supply it with
+`--test-list`.
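+
+For example, a test-list file can be created and queried as follows (the file name and its contents are illustrative; the expected format is one test name or pattern per line):
+
+```bash
+cat > my_tests.txt <<'EOF'
+triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]
+gpt_ib_ptuning
+EOF
+
+python scripts/test_to_stage_mapping.py --test-list my_tests.txt
+```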
-`triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]` appears in `l0_a100.yml` under `stage: post_merge` and `backend: triton`. The corresponding Jenkins stages are `A100X-Triton-[Post-Merge]-1` and `A100X-Triton-[Post-Merge]-2` (two shards).
To run the same tests on your pull request, comment:
@@ -67,6 +85,7 @@ To run the same tests on your pull request, comment:
This executes the same tests that run post-merge for this hardware/backend.
+
## Waiving tests
Sometimes a test is known to fail due to a bug or unsupported feature. Instead
diff --git a/latest/_sources/reference/support-matrix.md.txt b/latest/_sources/reference/support-matrix.md.txt
index a2c1718b0d..37fada2c0d 100644
--- a/latest/_sources/reference/support-matrix.md.txt
+++ b/latest/_sources/reference/support-matrix.md.txt
@@ -123,6 +123,7 @@ In addition, older architectures can have limitations for newer software release
- TensorRT-LLM requires Linux x86_64 or Linux aarch64.
* - GPU Model Architectures
-
+ - [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/)
- [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)
- [NVIDIA Grace Hopper Superchip](https://www.nvidia.com/en-us/data-center/grace-hopper-superchip/)
- [NVIDIA Hopper Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/)
diff --git a/latest/_sources/torch/arch_overview.md.txt b/latest/_sources/torch/arch_overview.md.txt
index 11b12781ce..ec7f6e51ab 100644
--- a/latest/_sources/torch/arch_overview.md.txt
+++ b/latest/_sources/torch/arch_overview.md.txt
@@ -37,7 +37,7 @@ The single-step flow of PyExecutor involves:
The core component of `PyExecutor` is the `ModelEngine`, responsible for executing the model's forward pass efficiently on the GPU.
The key method of `ModelEngine` is `forward`, which handles the forward pass computation.
-For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [pytorch_model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py).
+For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/model_engine.py).
## Decoder
diff --git a/latest/_sources/torch/features/feature_combination_matrix.md.txt b/latest/_sources/torch/features/feature_combination_matrix.md.txt
index 8f8d5defe8..f62c1d33aa 100644
--- a/latest/_sources/torch/features/feature_combination_matrix.md.txt
+++ b/latest/_sources/torch/features/feature_combination_matrix.md.txt
@@ -15,4 +15,4 @@
| KV Cache Reuse | Yes | Yes | Yes | Untested | Untested | Untested | Yes | No | Yes | Yes | --- | | | |
| Slide Window Attention | Yes | Yes | Yes | Untested | Untested | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
-| Guided Decoding | No | Yes | Yes | Untested | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
+| Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
diff --git a/latest/advanced/disaggregated-service.html b/latest/advanced/disaggregated-service.html
index 7b51649634..d175b6ba04 100644
--- a/latest/advanced/disaggregated-service.html
+++ b/latest/advanced/disaggregated-service.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
Control generated text using logits processor
Generate text with multiple LoRA adapters
Speculative Decoding
+Runtime Configuration Examples
+Sampling Techniques Showcase
Run LLM-API with pytorch backend on Slurm
Run trtllm-bench with pytorch backend on Slurm
Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -509,7 +513,6 @@ This feature is currently experimental, and the related API is subjected to chan
Environment Variables
TRT-LLM uses some environment variables to control the behavior of disaggregated service.
-TRTLLM_USE_UCX_KVCACHE: Specifies whether to use UCX for KV cache transfer. The default value is 0. This must be enabled when using a disaggregated service.
TRTLLM_PARALLEL_CACHE_SEND: If set to 1, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is 0.
TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: If set to 1, generationExecutor will not overlap KV cache transfer with model inference. The default value is 0.
TRTLLM_ENABLE_KVCACHE_RECEIVE_PARALLEL: When the generation rank receives KV cache from multiple context ranks within a single context instance, it will receive KV cache from each rank sequentially. If set to 1, the generation rank will receive KV cache from each rank within one context instance in parallel. The default value is 0.
@@ -540,50 +543,17 @@ This feature is currently experimental, and the related API is subjected to chan
Debugging FAQs
Q. How to handle error Disaggregated serving is not enabled, please check the configuration?
-A. Please set the environment variables
-export TRTLLM_USE_UCX_KVCACHE=1
+A. Please set the backendType of CacheTransceiverConfig:
+ExecutorConfig executorConfig{...};
+
+executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT));
-Q. Why do some profiling tools show that TRT-LLM’s KV cache transfer does not utilize NVLink even on devices equipped with NVLink?
-A. Please check version of UCX with ucx_info -v.
-If the version of UCX <=1.17, set the environment variables UCX_RNDV_FRAG_MEM_TYPE=cuda and UCX_MEMTYPE_CACHE=n to enable NVLink. For BlackWell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
-If the version of UCX >=1.18, there are several ways to enable NVLink:
-
-Set the environment variables TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B,UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda, UCX_CUDA_COPY_DMABUF=no, UCX_MEMTYPE_CACHE=n and UCX_RNDV_PIPELINE_ERROR_HANDLING=y.
-Set the environment variables TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size, UCX_MEMTYPE_CACHE=n and UCX_RNDV_PIPELINE_ERROR_HANDLING=y. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
-
+When the environment variable TRTLLM_USE_MPI_KVCACHE=1 is set, TRT-LLM will transfer the KV cache using CUDA-aware MPI. All executor processes involved must share the same MPI world communicator. Consequently, with TRTLLM_USE_MPI_KVCACHE=1, TRT-LLM only supports launching multiple executors via MPI. Additionally, the CommunicationMode for the executors must be set to kLEADER or kORCHESTRATOR with SpawnProcesses=false for the disaggregated-service. These restrictions do not apply when TRTLLM_USE_UCX_KVCACHE=1 is set.
Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?
-A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:
-
-Set the environment variables TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B,UCX_RNDV_FRAG_MEM_TYPE=cuda, UCX_MEMTYPE_CACHE=n and UCX_RNDV_PIPELINE_ERROR_HANDLING=y.
-Set the environment variables TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size, UCX_MEMTYPE_CACHE=n and UCX_RNDV_PIPELINE_ERROR_HANDLING=y, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
-
-Q. Are there any guidelines for performance tuning of KV cache transfer?
-A. Depending on the user’s use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
-Environment Variable Set A
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_RNDV_FRAG_MEM_TYPES=cuda
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-
-
-This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
-Environment Variable Set B
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
-export UCX_CUDA_COPY_DMABUF=no
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-
-
-Set B may provide slightly better performance on a single node compared to Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
-Environment Variable Set C
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-
-
-Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
+A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.
+Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?
+A. The communication channels for kvCache transfer between executors are established dynamically. Connection establishment incurs significant overhead, which explains the lower kvCache transfer bandwidth observed during the first requests after service startup: the measured bandwidth includes the connection establishment overhead. When conducting benchmarks, it is recommended to run a warm-up phase to ensure accurate performance measurements.
@@ -737,9 +707,9 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
diff --git a/latest/advanced/executor.html b/latest/advanced/executor.html
index 576e1fc49b..169e3bd856 100644
--- a/latest/advanced/executor.html
+++ b/latest/advanced/executor.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -798,9 +802,9 @@ the TensorRT-LLM C++ Executor API.
diff --git a/latest/advanced/expert-parallelism.html b/latest/advanced/expert-parallelism.html
index 5ae2f9035d..9694b1a6ce 100644
--- a/latest/advanced/expert-parallelism.html
+++ b/latest/advanced/expert-parallelism.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -665,9 +669,9 @@
diff --git a/latest/advanced/gpt-attention.html b/latest/advanced/gpt-attention.html
index 156291801d..8d873ea3a5 100644
--- a/latest/advanced/gpt-attention.html
+++ b/latest/advanced/gpt-attention.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -983,9 +987,9 @@ is computed as:
diff --git a/latest/advanced/gpt-runtime.html b/latest/advanced/gpt-runtime.html
index 5c2e8084e2..1e47fb9250 100644
--- a/latest/advanced/gpt-runtime.html
+++ b/latest/advanced/gpt-runtime.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -1024,9 +1028,9 @@ The
GptDecoder
diff --git a/latest/advanced/graph-rewriting.html b/latest/advanced/graph-rewriting.html
index d52266e9b5..4530a7c4fb 100644
--- a/latest/advanced/graph-rewriting.html
+++ b/latest/advanced/graph-rewriting.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -844,9 +848,9 @@ techniques to optimize the underlying graph. It provides a wrapper similar to P
diff --git a/latest/advanced/kv-cache-management.html b/latest/advanced/kv-cache-management.html
index 8f14cc4655..046e3e9f96 100644
--- a/latest/advanced/kv-cache-management.html
+++ b/latest/advanced/kv-cache-management.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -749,9 +753,9 @@ An “event” is any significant change in the lifecycle or state of a KV cache
diff --git a/latest/advanced/kv-cache-reuse.html b/latest/advanced/kv-cache-reuse.html
index d9c9b24349..4f40cd2001 100644
--- a/latest/advanced/kv-cache-reuse.html
+++ b/latest/advanced/kv-cache-reuse.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -722,9 +726,9 @@ Assume vocabulary size is 100, which means normal text token ids are in range [0
diff --git a/latest/advanced/lora.html b/latest/advanced/lora.html
index a860c1d000..06c1fd5079 100644
--- a/latest/advanced/lora.html
+++ b/latest/advanced/lora.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -836,9 +840,9 @@ The shape of
LoraWe
diff --git a/latest/advanced/lowprecision-pcie-allreduce.html b/latest/advanced/lowprecision-pcie-allreduce.html
index c9cac0d684..71d4499586 100644
--- a/latest/advanced/lowprecision-pcie-allreduce.html
+++ b/latest/advanced/lowprecision-pcie-allreduce.html
@@ -58,7 +58,7 @@
@@ -68,7 +68,7 @@
-
+
@@ -347,6 +347,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -382,6 +384,8 @@
Command-Line Reference
@@ -685,9 +689,9 @@ This feature is optimized for PCIe-based GPU topologies and may affect model acc
diff --git a/latest/advanced/open-sourced-cutlass-kernels.html b/latest/advanced/open-sourced-cutlass-kernels.html
index a35b266658..81223b55e2 100644
--- a/latest/advanced/open-sourced-cutlass-kernels.html
+++ b/latest/advanced/open-sourced-cutlass-kernels.html
@@ -58,7 +58,7 @@
@@ -68,7 +68,7 @@
-
+
@@ -347,6 +347,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -382,6 +384,8 @@
Command-Line Reference
@@ -640,9 +644,9 @@ Note that support for these static libraries will be gradually deprioritized in
diff --git a/latest/advanced/speculative-decoding.html b/latest/advanced/speculative-decoding.html
index 48cc11f85c..1fa59bf9a8 100644
--- a/latest/advanced/speculative-decoding.html
+++ b/latest/advanced/speculative-decoding.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -349,6 +349,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -384,6 +386,8 @@
Command-Line Reference
@@ -501,7 +505,7 @@
About Speculative Sampling
Performance Improvements
Draft-Target-Model
-Prompt-Lookup-Decoding
+NGram
Medusa
ReDrafter
-EAGLE
+EAGLE
+Disaggregated Serving
Lookahead decoding
@@ -563,12 +571,12 @@ Upon verification, the Target model may return up to TensorRT-LLM Triton backend.
-We provide two styles of running Draft-Target-Model now: using TensorRT-LLM-BLS in Triton Inference Server, or using TensorRT-LLM directly. Detailed steps of running can be found in examples/draft_target_model/README.md and the code can be found in examples/prompt_lookup/run_dtm_pld.py.
+We provide two ways to run Draft-Target-Model: using TensorRT-LLM-BLS in Triton Inference Server, or using TensorRT-LLM directly. Detailed running steps can be found in examples/draft_target_model/README.md and the code can be found in examples/ngram/run_dtm_ngram.py.
-
-Prompt-Lookup-Decoding
-The Prompt-Lookup speculative decoding directly copies from the input prompt and previous generated output as draft tokens while generating the later output. It works like Draft-Target-Model but involves only one Target LLM model without further fine-tuning. The Prompt-Lookup profit from the scenarios which have high n-gram overlap between input prompt and output, such as summarization, document QA, multi-turn chat, code editing, etc.
-See document in examples/prompt_lookup/README.md and the code can be found in examples/prompt_lookup/run_dtm_pld.py.
+
+NGram
+NGram speculative decoding directly copies tokens from the input prompt and previously generated output as draft tokens while generating later output. It works like Draft-Target-Model but involves only one Target LLM without further fine-tuning. NGram profits in scenarios with high n-gram overlap between the input prompt and the output, such as summarization, document QA, multi-turn chat, and code editing.
+See the documentation in examples/ngram/README.md; the code can be found in examples/ngram/run_dtm_ngram.py.
Medusa
@@ -659,6 +667,10 @@ However, similar to any new model, you can follow the same approach to define yo
EAGLE
The EAGLE approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. Similarly to ReDrafter, it predicts draft tokens using a recurrent predictor where each draft token depends on the previous one. However, unlike ReDrafter, it uses a single-layer transformer model to predict draft tokens from previous hidden states and decoded tokens. In EAGLE-1 the decoding tree needs to be known during decoding. In EAGLE-2 this tree is assembled during execution by searching for the most probable hypothesis along the beam.
Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft token acceptance, and draft token generation are performed inside the TensorRT engine. EAGLE-1 and EAGLE-2 are both supported, while EAGLE-2 is currently in the experimental stage. Please visit the EAGLE README for information about building and running the model.
+
+Disaggregated Serving
+Disaggregated Serving with EAGLE3 using the two-model approach is supported in the PyTorch backend. Refer to the following Dynamo example for how to run EAGLE3 with Disaggregated Serving for Llama 4 Maverick.
+
@@ -824,9 +839,9 @@ However, similar to any new model, you can follow the same approach to define yo
diff --git a/latest/advanced/weight-streaming.html b/latest/advanced/weight-streaming.html
index f1e877d015..df3f34f996 100644
--- a/latest/advanced/weight-streaming.html
+++ b/latest/advanced/weight-streaming.html
@@ -58,7 +58,7 @@
@@ -68,7 +68,7 @@
-
+
@@ -347,6 +347,8 @@
- Control generated text using logits processor
- Generate text with multiple LoRA adapters
- Speculative Decoding
+- Runtime Configuration Examples
+- Sampling Techniques Showcase
- Run LLM-API with pytorch backend on Slurm
- Run trtllm-bench with pytorch backend on Slurm
- Run trtllm-serve with pytorch backend on Slurm
@@ -382,6 +384,8 @@
Command-Line Reference
@@ -673,9 +677,9 @@ python3 examples/summarize.py
diff --git a/latest/architecture/add-model.html b/latest/architecture/add-model.html
index 6ab2d034bd..931cc7c743 100644
--- a/latest/architecture/add-model.html
+++ b/latest/architecture/add-model.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
--engine_di
diff --git a/latest/architecture/checkpoint.html b/latest/architecture/checkpoint.html
index 059b5ae264..0e4f536f88 100644
--- a/latest/architecture/checkpoint.html
+++ b/latest/architecture/checkpoint.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
./op
diff --git a/latest/architecture/core-concepts.html b/latest/architecture/core-concepts.html
index 61004ee758..dcc28ba03a 100644
--- a/latest/architecture/core-concepts.html
+++ b/latest/architecture/core-concepts.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
diff --git a/latest/architecture/model-weights-loader.html b/latest/architecture/model-weights-loader.html
index ba4bee3621..9e86d7083a 100644
--- a/latest/architecture/model-weights-loader.html
+++ b/latest/architecture/model-weights-loader.html
@@ -58,7 +58,7 @@
@@ -68,7 +68,7 @@