[docs]
def get_executor_config(
@@ -4087,9 +4245,9 @@
diff --git a/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html b/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html
index 75c2a35120..8193bf85e3 100644
--- a/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html
+++ b/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -778,9 +780,9 @@
diff --git a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
index 1a5bbbee79..57496f23c0 100644
--- a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
+++ b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1248,9 +1250,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/baichuan/model.html b/latest/_modules/tensorrt_llm/models/baichuan/model.html
index 1feedcf89d..17b9618c91 100644
--- a/latest/_modules/tensorrt_llm/models/baichuan/model.html
+++ b/latest/_modules/tensorrt_llm/models/baichuan/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -886,9 +888,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/bert/model.html b/latest/_modules/tensorrt_llm/models/bert/model.html
index ff8bca7065..9c5e8ca7ef 100644
--- a/latest/_modules/tensorrt_llm/models/bert/model.html
+++ b/latest/_modules/tensorrt_llm/models/bert/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1190,9 +1192,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/bloom/model.html b/latest/_modules/tensorrt_llm/models/bloom/model.html
index bdfef77765..52a5820861 100644
--- a/latest/_modules/tensorrt_llm/models/bloom/model.html
+++ b/latest/_modules/tensorrt_llm/models/bloom/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -798,9 +800,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/config.html b/latest/_modules/tensorrt_llm/models/chatglm/config.html
index 24e4fa539e..fa1b3c6424 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/config.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -815,9 +817,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/model.html b/latest/_modules/tensorrt_llm/models/chatglm/model.html
index 3ad79c2e00..ac8294f98c 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/model.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1014,9 +1016,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/clip/model.html b/latest/_modules/tensorrt_llm/models/clip/model.html
index 3fa05c6d7a..054e22a83e 100644
--- a/latest/_modules/tensorrt_llm/models/clip/model.html
+++ b/latest/_modules/tensorrt_llm/models/clip/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -843,9 +845,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/config.html b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
index 964bf3dc22..75a7de91e7 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/config.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -674,9 +676,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/model.html b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
index 1a095e6404..4d2557aec9 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/model.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -927,9 +929,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/commandr/model.html b/latest/_modules/tensorrt_llm/models/commandr/model.html
index c005f54ad6..cb0c864cd7 100644
--- a/latest/_modules/tensorrt_llm/models/commandr/model.html
+++ b/latest/_modules/tensorrt_llm/models/commandr/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -825,9 +827,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/config.html b/latest/_modules/tensorrt_llm/models/dbrx/config.html
index 33668b804a..9a7e917fea 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/config.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -689,9 +691,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/model.html b/latest/_modules/tensorrt_llm/models/dbrx/model.html
index 1f7d28a8c7..c058b28af2 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/model.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -815,9 +817,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
index 58859057d6..4ef0de9bfe 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -909,9 +911,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
index b8f43dd0ac..14b01a9e89 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -991,9 +993,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dit/model.html b/latest/_modules/tensorrt_llm/models/dit/model.html
index 79248a4f0f..3b1edabc97 100644
--- a/latest/_modules/tensorrt_llm/models/dit/model.html
+++ b/latest/_modules/tensorrt_llm/models/dit/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1027,9 +1029,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/eagle/model.html b/latest/_modules/tensorrt_llm/models/eagle/model.html
index 746b54b9c6..72268516ce 100644
--- a/latest/_modules/tensorrt_llm/models/eagle/model.html
+++ b/latest/_modules/tensorrt_llm/models/eagle/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1963,9 +1965,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/enc_dec/model.html b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
index 29b872204a..5670bcb067 100644
--- a/latest/_modules/tensorrt_llm/models/enc_dec/model.html
+++ b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -2870,9 +2872,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/falcon/config.html b/latest/_modules/tensorrt_llm/models/falcon/config.html
index 1544f9b733..a517fbaa87 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/config.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -750,9 +752,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/falcon/model.html b/latest/_modules/tensorrt_llm/models/falcon/model.html
index 2cb24e48cd..77a7150981 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/model.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -912,9 +914,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gemma/config.html b/latest/_modules/tensorrt_llm/models/gemma/config.html
index d887eb7302..7e7f0420b7 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/config.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -840,9 +842,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gemma/model.html b/latest/_modules/tensorrt_llm/models/gemma/model.html
index 38fa4c0da5..c0bcd22677 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1035,9 +1037,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gpt/config.html b/latest/_modules/tensorrt_llm/models/gpt/config.html
index b4564e9fea..408afb19c0 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/config.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -959,9 +961,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gpt/model.html b/latest/_modules/tensorrt_llm/models/gpt/model.html
index c7ccc4dcd4..80d5cb7e7b 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1062,9 +1064,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptj/config.html b/latest/_modules/tensorrt_llm/models/gptj/config.html
index 3f391136e7..af1260074d 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/config.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -688,9 +690,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptj/model.html b/latest/_modules/tensorrt_llm/models/gptj/model.html
index 1c37471537..23c9da2bfb 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -838,9 +840,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptneox/model.html b/latest/_modules/tensorrt_llm/models/gptneox/model.html
index 7f514b91c3..f10a22212d 100644
--- a/latest/_modules/tensorrt_llm/models/gptneox/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptneox/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -780,9 +782,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/llama/config.html b/latest/_modules/tensorrt_llm/models/llama/config.html
index fdf8b75fd5..35794b9a22 100644
--- a/latest/_modules/tensorrt_llm/models/llama/config.html
+++ b/latest/_modules/tensorrt_llm/models/llama/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -914,9 +916,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/llama/model.html b/latest/_modules/tensorrt_llm/models/llama/model.html
index d09be90d7a..4960edfb55 100644
--- a/latest/_modules/tensorrt_llm/models/llama/model.html
+++ b/latest/_modules/tensorrt_llm/models/llama/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1262,9 +1264,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mamba/model.html b/latest/_modules/tensorrt_llm/models/mamba/model.html
index d11931dd33..2e19b21a09 100644
--- a/latest/_modules/tensorrt_llm/models/mamba/model.html
+++ b/latest/_modules/tensorrt_llm/models/mamba/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1107,9 +1109,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/medusa/config.html b/latest/_modules/tensorrt_llm/models/medusa/config.html
index 5fcfb61ebc..8169b9e474 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/config.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -747,9 +749,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/medusa/model.html b/latest/_modules/tensorrt_llm/models/medusa/model.html
index e903cea5f6..160c44d25d 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/model.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -897,9 +899,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mllama/model.html b/latest/_modules/tensorrt_llm/models/mllama/model.html
index 6be138b113..02d061302a 100644
--- a/latest/_modules/tensorrt_llm/models/mllama/model.html
+++ b/latest/_modules/tensorrt_llm/models/mllama/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -2208,9 +2210,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
index b9d46b13f0..6ec32c5df2 100644
--- a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
+++ b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1274,9 +1276,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/modeling_utils.html b/latest/_modules/tensorrt_llm/models/modeling_utils.html
index bd2432a93e..3812b60a07 100644
--- a/latest/_modules/tensorrt_llm/models/modeling_utils.html
+++ b/latest/_modules/tensorrt_llm/models/modeling_utils.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -2683,9 +2685,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mpt/model.html b/latest/_modules/tensorrt_llm/models/mpt/model.html
index 8813fa46cb..459ef5830e 100644
--- a/latest/_modules/tensorrt_llm/models/mpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/mpt/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -812,9 +814,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
index ffda4025f2..155f15b151 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -746,9 +748,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
index 1c19643337..abb527e04b 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -814,9 +816,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/opt/model.html b/latest/_modules/tensorrt_llm/models/opt/model.html
index e10798ca21..4e3f9e15a7 100644
--- a/latest/_modules/tensorrt_llm/models/opt/model.html
+++ b/latest/_modules/tensorrt_llm/models/opt/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -817,9 +819,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/phi/model.html b/latest/_modules/tensorrt_llm/models/phi/model.html
index deecc0188d..3ca7749044 100644
--- a/latest/_modules/tensorrt_llm/models/phi/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -859,9 +861,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/phi3/model.html b/latest/_modules/tensorrt_llm/models/phi3/model.html
index ce940e2d70..05e965e296 100644
--- a/latest/_modules/tensorrt_llm/models/phi3/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi3/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -955,9 +957,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
index cbe0030d9c..b71d6ca879 100644
--- a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1260,9 +1262,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/redrafter/model.html b/latest/_modules/tensorrt_llm/models/redrafter/model.html
index 6abe0d27cd..05ecd0e886 100644
--- a/latest/_modules/tensorrt_llm/models/redrafter/model.html
+++ b/latest/_modules/tensorrt_llm/models/redrafter/model.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -947,9 +949,9 @@
diff --git a/latest/_modules/tensorrt_llm/plugin/plugin.html b/latest/_modules/tensorrt_llm/plugin/plugin.html
index 25b371b9d8..6c5c97516b 100644
--- a/latest/_modules/tensorrt_llm/plugin/plugin.html
+++ b/latest/_modules/tensorrt_llm/plugin/plugin.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1435,9 +1437,9 @@
diff --git a/latest/_modules/tensorrt_llm/quantization/mode.html b/latest/_modules/tensorrt_llm/quantization/mode.html
index 35fc7812a6..8937e5cda1 100644
--- a/latest/_modules/tensorrt_llm/quantization/mode.html
+++ b/latest/_modules/tensorrt_llm/quantization/mode.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1109,9 +1111,9 @@
diff --git a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
index cce8bf592a..63d236e56a 100644
--- a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
+++ b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1903,9 +1905,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
index 949b58db8e..81d68fc822 100644
--- a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1174,9 +1176,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/generation.html b/latest/_modules/tensorrt_llm/runtime/generation.html
index 3d57374b31..d2e389435a 100644
--- a/latest/_modules/tensorrt_llm/runtime/generation.html
+++ b/latest/_modules/tensorrt_llm/runtime/generation.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -5514,9 +5516,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
index 93d0048c58..3f91525255 100644
--- a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
+++ b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1118,9 +1120,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner.html b/latest/_modules/tensorrt_llm/runtime/model_runner.html
index 77830c02ea..23889296f5 100644
--- a/latest/_modules/tensorrt_llm/runtime/model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/model_runner.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -991,6 +993,7 @@
prompt_table,
torch.Tensor), "Prompt table should be str or torch.Tensor"
prompt_table_data = prompt_table.to(dtype=self.dtype)
+torch.cuda.current_stream().synchronize()
return prompt_table_data
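For context, the hunk above adds a stream synchronization right after the dtype cast of the prompt table. A minimal sketch of that pattern is shown below (a hypothetical standalone helper, not the runner's actual method; it requires a CUDA device):

```python
import torch

def prepare_prompt_table(prompt_table: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Mirrors the assertion in the hunk above: upstream only accepts str or torch.Tensor.
    assert isinstance(prompt_table, torch.Tensor), "Prompt table should be str or torch.Tensor"
    prompt_table_data = prompt_table.to(dtype=dtype)  # .to() can run asynchronously on GPU tensors
    # Block the current CUDA stream so the converted table is ready before it is consumed.
    torch.cuda.current_stream().synchronize()
    return prompt_table_data
```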
@@ -1637,9 +1640,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
index 074e6b3161..ac227fda67 100644
--- a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
+++ b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -1850,9 +1852,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
index 37ef76d322..e53e09fcc9 100644
--- a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -3432,9 +3434,9 @@
diff --git a/latest/_modules/tensorrt_llm/runtime/session.html b/latest/_modules/tensorrt_llm/runtime/session.html
index aa650ba6b9..da040db4d2 100644
--- a/latest/_modules/tensorrt_llm/runtime/session.html
+++ b/latest/_modules/tensorrt_llm/runtime/session.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -978,9 +980,9 @@
diff --git a/latest/_modules/tensorrt_llm/sampling_params.html b/latest/_modules/tensorrt_llm/sampling_params.html
index 12e5d655ff..75447e5237 100644
--- a/latest/_modules/tensorrt_llm/sampling_params.html
+++ b/latest/_modules/tensorrt_llm/sampling_params.html
@@ -60,7 +60,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -353,6 +353,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+
Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -361,6 +362,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+
OpenAI Responses Client
Dynamo K8s Example
@@ -862,9 +864,13 @@
[docs]
@staticmethod
def params_imply_greedy_decoding(
-    *, temperature: Optional[float], top_p: Optional[float], top_k: Optional[int]
+    *,
+    temperature: Optional[float],
+    top_p: Optional[float],
+    top_k: Optional[int],
+    use_beam_search: bool | None,
):
-    return (
+    return (not use_beam_search) and (
        (temperature is None and top_p is None and top_k is None)
        or top_k == 1
        or top_p == 0.0
@@ -874,10 +880,11 @@
@property
def _greedy_decoding(self) -> bool:
-    return not self.use_beam_search and self.params_imply_greedy_decoding(
+    return self.params_imply_greedy_decoding(
        temperature=self.temperature,
        top_p=self.top_p,
        top_k=self.top_k,
+        use_beam_search=self.use_beam_search,
    )
@property
@property
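The hunk above threads `use_beam_search` into the static check so beam search is never classified as greedy decoding. A standalone sketch of the resulting predicate (logic copied from the diff, not the library module itself; `bool | None` needs Python 3.10+):

```python
from typing import Optional

def params_imply_greedy_decoding(
    *,
    temperature: Optional[float],
    top_p: Optional[float],
    top_k: Optional[int],
    use_beam_search: bool | None,
) -> bool:
    # Beam search rules out greedy decoding outright; otherwise unset sampling
    # parameters, top_k == 1, or top_p == 0.0 all collapse to greedy decoding.
    return (not use_beam_search) and (
        (temperature is None and top_p is None and top_k is None)
        or top_k == 1
        or top_p == 0.0
    )

# Unset sampling params without beam search imply greedy decoding...
assert params_imply_greedy_decoding(
    temperature=None, top_p=None, top_k=None, use_beam_search=False
)
# ...while enabling beam search does not.
assert not params_imply_greedy_decoding(
    temperature=None, top_p=None, top_k=None, use_beam_search=True
)
```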
@@ -1192,9 +1199,9 @@
diff --git a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
index da72ee5464..ad0e9975a1 100644
--- a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
+++ b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
@@ -30,7 +30,7 @@ In this blog, we share the configurations and procedures about how to reproduce
- [Expected Result Format](#expected-result-format-3)
- [Exploring more ISL/OSL combinations](#exploring-more-islosl-combinations)
- [WIP: Enable more features by default](#wip-enable-more-features-by-default)
- - [Not supported: MLA chunked context support on Hopper](#not-supported-mla-chunked-context-support-on-hopper)
+ - [MLA chunked context](#mla-chunked-context)
- [Out of memory issues](#out-of-memory-issues)
@@ -69,8 +69,11 @@ For NVIDIA Hopper GPUs, it's recommended to use the FP8 version of the DeepSeek
YOUR_MODEL_PATH=
cd $YOUR_MODEL_PATH
-## Download FP4 model for Blackwell GPUs
-git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4
+## Download NVFP4 model for Blackwell GPUs
+git clone https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2
+
+## Or the 0528 version
+git clone https://huggingface.co/nvidia/DeepSeek-R1-0528-NVFP4-v2
## Download FP8 model for Hopper GPUs
## FP8 model also works for Blackwell, but FP4 has the best performance on Blackwell.
@@ -248,13 +251,13 @@ To do the benchmark, run the following command:
```bash
# generate synthetic dataset
-python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
- --stdout \
- --tokenizer nvidia/DeepSeek-R1-FP4 \
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+ prepare-dataset \
+ --output dataset.txt \
token-norm-dist \
--input-mean 1024 --output-mean 2048 \
--input-stdev 0 --output-stdev 0 \
- --num-requests 49152 > dataset.txt
+ --num-requests 49152
YOUR_DATA_PATH=./dataset.txt
@@ -350,13 +353,14 @@ To do the benchmark, run the following command:
```bash
# generate synthetic dataset
-python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
- --stdout \
- --tokenizer deepseek-ai/DeepSeek-R1 \
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+ prepare-dataset \
+ --output dataset.txt \
token-norm-dist \
--input-mean 1024 --output-mean 2048 \
--input-stdev 0 --output-stdev 0 \
- --num-requests 5120 > dataset.txt
+ --num-requests 5120
+
YOUR_DATA_PATH=./dataset.txt
cat >./extra-llm-api-config.yml<`_ for examples in the following sections.
+After you start the server, you can send inference requests through the Completions API, Chat API, and Responses API, which are compatible with the corresponding OpenAI APIs. We use `TinyLlama-1.1B-Chat-v1.0 `_ for the examples in the following sections.
Chat API
~~~~~~~~
@@ -66,6 +66,24 @@ Another example uses ``curl``:
:language: bash
:linenos:
+Responses API
+~~~~~~~~~~~~~~~
+
+You can query the Responses API with any HTTP client; a typical example is the OpenAI Python client:
+
+.. literalinclude:: ../../../../examples/serve/openai_responses_client.py
+ :language: python
+ :linenos:
+
+Another example uses ``curl``:
+
+.. literalinclude:: ../../../../examples/serve/curl_responses_client.sh
+ :language: bash
+ :linenos:
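For orientation, a minimal sketch of such a client is shown below. It assumes a server listening on ``localhost:8000``, a placeholder model name, and a recent ``openai`` SDK that exposes the Responses API; the shipped ``openai_responses_client.py`` example referenced above is the authoritative version.

```python
from openai import OpenAI

# Point the OpenAI client at the local trtllm-serve endpoint; the API key is unused.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.responses.create(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder; use the served model name
    input="Where is New York?",
)
print(response.output_text)
```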
+
+
+More OpenAI-compatible examples can be found in the `compatibility examples `_ directory.
+
Multimodal Serving
~~~~~~~~~~~~~~~~~~
diff --git a/latest/_sources/deployment-guide/config_table.rst.txt b/latest/_sources/deployment-guide/config_table.rst.txt
new file mode 100644
index 0000000000..d28fed25a8
--- /dev/null
+++ b/latest/_sources/deployment-guide/config_table.rst.txt
@@ -0,0 +1,1074 @@
+.. include:: note_sections.rst
+ :start-after: .. start-note-traffic-patterns
+ :end-before: .. end-note-traffic-patterns
+
+.. start-deepseek-ai/DeepSeek-R1-0528
+
+.. _deepseek-ai/DeepSeek-R1-0528:
+
+`DeepSeek-R1 `_
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+ :width: 100%
+ :header-rows: 1
+ :widths: 12 15 15 13 20 25
+
+ * - GPU
+ - Performance Profile
+ - ISL / OSL
+ - Concurrency
+ - Config
+ - Command
+ * - 8xB200_NVL
+ - Min Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml``
+ * - 8xB200_NVL
+ - Balanced
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml``
+ * - 8xB200_NVL
+ - Min Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml``
+ * - 8xB200_NVL
+ - Balanced
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml``
+ * - 8xH200_SXM
+ - Min Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml``
+ * - 8xH200_SXM
+ - Balanced
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml``
+ * - 8xH200_SXM
+ - Max Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml``
+ * - 8xH200_SXM
+ - Min Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml``
+ * - 8xH200_SXM
+ - Balanced
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml``
+ * - 8xH200_SXM
+ - Max Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml``
+
+.. end-deepseek-ai/DeepSeek-R1-0528
+
+.. start-nvidia/DeepSeek-R1-0528-FP4-v2
+
+.. _nvidia/DeepSeek-R1-0528-FP4-v2:
+
+`DeepSeek-R1 (NVFP4) `_
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+ :width: 100%
+ :header-rows: 1
+ :widths: 12 15 15 13 20 25
+
+ * - GPU
+ - Performance Profile
+ - ISL / OSL
+ - Concurrency
+ - Config
+ - Command
+ * - 4xB200_NVL
+ - Min Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 128
+ - `1k1k_tp4_conc128.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 128
+ - `1k1k_tp8_conc128.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 256
+ - `1k1k_tp4_conc256.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 1024 / 1024
+ - 256
+ - `1k1k_tp8_conc256.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml``
+ * - 4xB200_NVL
+ - Min Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 128
+ - `8k1k_tp4_conc128.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 128
+ - `8k1k_tp8_conc128.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 256
+ - `8k1k_tp4_conc256.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 8192 / 1024
+ - 256
+ - `8k1k_tp8_conc256.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml``
+
+.. end-nvidia/DeepSeek-R1-0528-FP4-v2
+
+.. start-openai/gpt-oss-120b
+
+.. _openai/gpt-oss-120b:
+
+`gpt-oss-120b `_
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+ :width: 100%
+ :header-rows: 1
+ :widths: 12 15 15 13 20 25
+
+ * - GPU
+ - Performance Profile
+ - ISL / OSL
+ - Concurrency
+ - Config
+ - Command
+ * - B200_NVL
+ - Min Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml``
+ * - B200_NVL
+ - Min Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml``
+ * - B200_NVL
+ - Min Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml``
+ * - 4xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml``
+ * - 8xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml``
+ * - B200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml``
+ * - 2xB200_NVL
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml``
+ * - 8xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml``
+ * - B200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml``
+ * - 2xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml``
+ * - 4xB200_NVL
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml``
+ * - 8xB200_NVL
+ - Max Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml``
+ * - H200_SXM
+ - Min Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 4
+ - `1k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 8
+ - `1k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 16
+ - `1k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 32
+ - `1k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml``
+ * - 8xH200_SXM
+ - Max Throughput
+ - 1024 / 1024
+ - 64
+ - `1k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml``
+ * - H200_SXM
+ - Min Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 4
+ - `1k8k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 8
+ - `1k8k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 16
+ - `1k8k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 32
+ - `1k8k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml``
+ * - 8xH200_SXM
+ - Max Throughput
+ - 1024 / 8192
+ - 64
+ - `1k8k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml``
+ * - H200_SXM
+ - Min Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp1_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp2_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp4_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 4
+ - `8k1k_tp8_conc4.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp1_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp2_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml``
+ * - 4xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp4_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml``
+ * - 8xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 8
+ - `8k1k_tp8_conc8.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml``
+ * - H200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp1_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml``
+ * - 2xH200_SXM
+ - Low Latency
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp2_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp4_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 16
+ - `8k1k_tp8_conc16.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp1_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp2_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp4_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml``
+ * - 8xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 32
+ - `8k1k_tp8_conc32.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml``
+ * - H200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp1_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml``
+ * - 2xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp2_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml``
+ * - 4xH200_SXM
+ - High Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp4_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml``
+ * - 8xH200_SXM
+ - Max Throughput
+ - 8192 / 1024
+ - 64
+ - `8k1k_tp8_conc64.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml``
+
+.. end-openai/gpt-oss-120b
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt
index 55deeb94fe..a887ec24b9 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt
@@ -47,7 +47,7 @@ docker run --rm -it \
-p 8000:8000 \
-v ~/.cache:/root/.cache:rw \
--name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
/bin/bash
```
@@ -66,7 +66,7 @@ We maintain YAML configuration files with recommended performance settings in th
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -74,7 +74,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/deepseek-r1-throughput.yaml
+```{literalinclude} ../../../examples/configs/curated/deepseek-r1-throughput.yaml
---
language: shell
prepend: |
@@ -90,7 +90,7 @@ To use the `DeepGEMM` MOE backend on B200/GB200, use this config instead:
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -98,7 +98,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/deepseek-r1-deepgemm.yaml
+```{literalinclude} ../../../examples/configs/curated/deepseek-r1-deepgemm.yaml
---
language: shell
prepend: |
@@ -154,7 +154,7 @@ These options provide control over TensorRT LLM's behavior and are set within th
#### `trust_remote_code`
- **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
+* **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
#### `kv_cache_config`
@@ -429,3 +429,23 @@ $$
$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$
+
+## Preconfigured Recipes
+
+The following tables list recommended configurations from the comprehensive database for different performance profiles.
+
+```{eval-rst}
+.. include:: note_sections.rst
+ :start-after: .. start-note-traffic-patterns
+ :end-before: .. end-note-traffic-patterns
+
+.. include:: config_table.rst
+ :start-after: .. start-deepseek-ai/DeepSeek-R1-0528
+ :end-before: .. end-deepseek-ai/DeepSeek-R1-0528
+```
+
+```{eval-rst}
+.. include:: config_table.rst
+ :start-after: .. start-nvidia/DeepSeek-R1-0528-FP4-v2
+ :end-before: .. end-nvidia/DeepSeek-R1-0528-FP4-v2
+```
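+
+As a worked example (illustrative only; the full set of available configs is listed in the tables above), the database config filenames encode the traffic pattern and parallelism: `8k1k_tp8_conc16.yaml` targets ISL/OSL 8192/1024, appears to use a tensor-parallel size of 8 (matching 8xB200), and is tuned for a concurrency of 16. A typical launch with that config looks like:
+
+```shell
+TRTLLM_DIR=/app/tensorrt_llm  # change as needed to match your environment
+# Serve the NVFP4 DeepSeek-R1 checkpoint with the 8k1k, TP8, concurrency-16 database config on B200.
+trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 \
+  --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml
+```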
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt
index ae34c5b3ce..cc30f55e98 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt
@@ -43,7 +43,7 @@ docker run --rm -it \
-p 8000:8000 \
-v ~/.cache:/root/.cache:rw \
--name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
/bin/bash
```
@@ -64,7 +64,7 @@ For low-latency use cases:
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -72,7 +72,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/gpt-oss-120b-latency.yaml
+```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-latency.yaml
---
language: shell
prepend: |
@@ -88,7 +88,7 @@ For max-throughput use cases:
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -96,7 +96,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/gpt-oss-120b-throughput.yaml
+```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-throughput.yaml
---
language: shell
prepend: |
@@ -377,3 +377,17 @@ $$
$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$
+
+## Preconfigured Recipes
+
+The following table lists recommended configurations from the comprehensive database for different performance profiles.
+
+```{eval-rst}
+.. include:: note_sections.rst
+ :start-after: .. start-note-traffic-patterns
+ :end-before: .. end-note-traffic-patterns
+
+.. include:: config_table.rst
+ :start-after: .. start-openai/gpt-oss-120b
+ :end-before: .. end-openai/gpt-oss-120b
+```
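+
+Once a server is running with one of these configs, a quick request against the OpenAI-compatible endpoint can confirm it is responding. The snippet below is a minimal sketch; it assumes the server is listening on localhost:8000, the port mapped in the Docker command above:
+
+```shell
+# Sanity-check the chat completions endpoint after the server reports it is ready.
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "openai/gpt-oss-120b",
+        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+        "max_tokens": 32
+      }'
+```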
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt
index d8ec17daff..391a72091d 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt
@@ -306,3 +306,18 @@ Run `bench.sh` to begin a serving benchmark.
```shell
./bench.sh
```
+
+## Troubleshooting
+
+Since Kimi K2 Thinking has a larger weight size than other models, it is possible to see host OOM issues such as the following:
+
+```log
+Loading weights: 100%|█████████████████████| 1408/1408 [03:43<00:00, 6.30it/s]
+ 0: [12/04/2025-18:38:28] [TRT-LLM] [RANK 0] [I] moe_load_balancer finalizing model...
+ 1: [nvl72136-T14:452151:0:452151] Caught signal 7 (Bus error: nonexistent physical address)
+ 1: ==== backtrace (tid: 452151) ====
+ 1: 0 /usr/local/ucx//lib/libucs.so.0(ucs_handle_error+0x2cc) [0xffff9638274c]
+ 1: 1 /usr/local/ucx//lib/libucs.so.0(+0x328fc) [0xffff963828fc]
+ 1: 2 /usr/local/ucx//lib/libucs.so.0(+0x32c78) [0xffff96382c78]
+```
+This can be addressed by mounting `tmpfs:/dev/shm:size=640G` when launching the Docker container, which increases the shared memory (shm) size available to the container.
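+
+A minimal sketch of one way to do this uses Docker's `--shm-size` flag as an equivalent to the tmpfs mount described above; the remaining flags are assumed to match the `docker run` command used earlier in this guide:
+
+```shell
+# Illustrative: enlarge the container's shared-memory allocation to avoid host OOM
+# during weight loading, keeping the rest of the launch command unchanged.
+docker run --rm -it \
+  --shm-size=640g \
+  -p 8000:8000 \
+  -v ~/.cache:/root/.cache:rw \
+  --name tensorrt_llm \
+  nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
+  /bin/bash
+```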
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt
index d227b2f440..b45b7d2ffa 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt
@@ -39,7 +39,7 @@ docker run --rm -it \
-p 8000:8000 \
-v ~/.cache:/root/.cache:rw \
--name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
/bin/bash
```
@@ -58,7 +58,7 @@ We maintain YAML configuration files with recommended performance settings in th
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -66,7 +66,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/llama-3.3-70b.yaml
+```{literalinclude} ../../../examples/configs/curated/llama-3.3-70b.yaml
---
language: shell
prepend: |
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt
index 509a5cf00f..3e70209b21 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt
@@ -38,7 +38,7 @@ docker run --rm -it \
-p 8000:8000 \
-v ~/.cache:/root/.cache:rw \
--name tensorrt_llm \
-nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
+nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
/bin/bash
```
@@ -57,7 +57,7 @@ We maintain YAML configuration files with recommended performance settings in th
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -65,7 +65,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/llama-4-scout.yaml
+```{literalinclude} ../../../examples/configs/curated/llama-4-scout.yaml
---
language: shell
prepend: |
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt
index 246fc74a56..46bf724b71 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt
@@ -35,7 +35,7 @@ We maintain YAML configuration files with recommended performance settings in th
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3-next.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -43,7 +43,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/qwen3-next.yaml
+```{literalinclude} ../../../examples/configs/curated/qwen3-next.yaml
---
language: shell
prepend: |
diff --git a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt
index 190740ebd8..894c6a1e63 100644
--- a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt
@@ -40,7 +40,7 @@ We maintain YAML configuration files with recommended performance settings in th
```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml
```
Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
@@ -48,7 +48,7 @@ Note: if you don't have access to the source code locally, you can manually crea
````{admonition} Show code
:class: dropdown
-```{literalinclude} ../../../examples/configs/qwen3.yaml
+```{literalinclude} ../../../examples/configs/curated/qwen3.yaml
---
language: shell
prepend: |
diff --git a/latest/_sources/deployment-guide/index.rst.txt b/latest/_sources/deployment-guide/index.rst.txt
index ed7fd9c536..644a9d9ae9 100644
--- a/latest/_sources/deployment-guide/index.rst.txt
+++ b/latest/_sources/deployment-guide/index.rst.txt
@@ -6,15 +6,20 @@ Quick Start for Popular Models
The table below contains ``trtllm-serve`` commands that can be used to easily deploy popular models including DeepSeek-R1, gpt-oss, Llama 4, Qwen3, and more.
-We maintain LLM API configuration files for these models containing recommended performance settings in the `examples/configs `_ directory. The TensorRT LLM Docker container makes the config files available at ``/app/tensorrt_llm/examples/configs``, but you can customize this as needed:
+We maintain LLM API configuration files for these models containing recommended performance settings in two locations:
+
+* **Curated Examples**: `examples/configs/curated `_ - Hand-picked configurations for common scenarios.
+* **Comprehensive Database**: `examples/configs/database `_ - A more comprehensive set of known-good configurations for various GPUs and traffic patterns.
+
+The TensorRT LLM Docker container makes these config files available at ``/app/tensorrt_llm/examples/configs/curated`` and ``/app/tensorrt_llm/examples/configs/database`` respectively. You can reference them as needed:
.. code-block:: bash
export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment
-.. note::
-
- The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, you may benefit from additional tuning. In the future, we plan to provide more configs for a wider range of traffic patterns.
+.. include:: note_sections.rst
+ :start-after: .. start-note-quick-start-isl-osl
+ :end-before: .. end-note-quick-start-isl-osl
This table is designed to provide a straightforward starting point; for detailed model-specific deployment guides, check out the guides below.
@@ -30,53 +35,53 @@ This table is designed to provide a straightforward starting point; for detailed
* - `DeepSeek-R1 `_
- H100, H200
- Max Throughput
- - `deepseek-r1-throughput.yaml `_
- - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml``
+ - `deepseek-r1-throughput.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml``
* - `DeepSeek-R1 `_
- B200, GB200
- Max Throughput
- - `deepseek-r1-deepgemm.yaml `_
- - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml``
+ - `deepseek-r1-deepgemm.yaml `_
+ - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml``
* - `DeepSeek-R1 (NVFP4) `_
- B200, GB200
- Max Throughput
- - `deepseek-r1-throughput.yaml `_
- - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml``
+ - `deepseek-r1-throughput.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml``
* - `DeepSeek-R1 (NVFP4) `_
- B200, GB200
- Min Latency
- - `deepseek-r1-latency.yaml `_
- - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-latency.yaml``
+ - `deepseek-r1-latency.yaml `_
+ - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml``
* - `gpt-oss-120b `_
- Any
- Max Throughput
- - `gpt-oss-120b-throughput.yaml `_
- - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml``
+ - `gpt-oss-120b-throughput.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml``
* - `gpt-oss-120b `_
- Any
- Min Latency
- - `gpt-oss-120b-latency.yaml `_
- - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml``
+ - `gpt-oss-120b-latency.yaml `_
+ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml``
* - `Qwen3-Next-80B-A3B-Thinking `_
- Any
- Max Throughput
- - `qwen3-next.yaml `_
- - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3-next.yaml``
+ - `qwen3-next.yaml `_
+ - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml``
* - Qwen3 family (e.g. `Qwen3-30B-A3B `_)
- Any
- Max Throughput
- - `qwen3.yaml `_
- - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3.yaml`` (swap to another Qwen3 model name as needed)
+ - `qwen3.yaml `_
+ - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed)
* - `Llama-3.3-70B (FP8) `_
- Any
- Max Throughput
- - `llama-3.3-70b.yaml `_
- - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml``
+ - `llama-3.3-70b.yaml `_
+ - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml``
* - `Llama 4 Scout (FP8) `_
- Any
- Max Throughput
- - `llama-4-scout.yaml `_
- - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml``
+ - `llama-4-scout.yaml `_
+ - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml``
Model-Specific Deployment Guides
---------------------------------
@@ -94,3 +99,10 @@ The deployment guides below provide more detailed instructions for serving speci
deployment-guide-for-qwen3-on-trtllm.md
deployment-guide-for-qwen3-next-on-trtllm.md
deployment-guide-for-kimi-k2-thinking-on-trtllm.md
+
+Comprehensive Configuration Database
+------------------------------------
+
+The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings.
+
+.. include:: config_table.rst
diff --git a/latest/_sources/deployment-guide/note_sections.rst.txt b/latest/_sources/deployment-guide/note_sections.rst.txt
new file mode 100644
index 0000000000..4cd0d1c41d
--- /dev/null
+++ b/latest/_sources/deployment-guide/note_sections.rst.txt
@@ -0,0 +1,36 @@
+..
+ Reusable note sections for deployment guides.
+ Include specific notes using:
+
+ .. include:: note_sections.rst
+ :start-after: .. start-note-
+ :end-before: .. end-note-
+
+.. start-note-traffic-patterns
+
+.. note::
+
+ **Traffic Patterns**: The ISL (Input Sequence Length) and OSL (Output Sequence Length)
+ values in each configuration represent the **maximum supported values** for that config.
+ Requests exceeding these limits may result in errors.
+
+ To handle requests with input sequences **longer than the configured ISL**, add the following
+ to your config file:
+
+ .. code-block:: yaml
+
+ enable_chunked_prefill: true
+
+ This enables chunked prefill, which processes long input sequences in chunks rather than
+ requiring them to fit within a single prefill operation. Note that enabling chunked prefill
+ does **not** guarantee optimal performance—these configs are tuned for the specified ISL/OSL.
+
+.. end-note-traffic-patterns
+
+.. start-note-quick-start-isl-osl
+
+.. note::
+
+ The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Comprehensive Configuration Database` section below which covers a larger set of traffic patterns and performance profiles.
+
+.. end-note-quick-start-isl-osl
diff --git a/latest/_sources/developer-guide/perf-analysis.md.txt b/latest/_sources/developer-guide/perf-analysis.md.txt
index 3ac01d82ed..4aa26ecbda 100644
--- a/latest/_sources/developer-guide/perf-analysis.md.txt
+++ b/latest/_sources/developer-guide/perf-analysis.md.txt
@@ -72,10 +72,12 @@ Say we want to profile iterations 100 to 150 on a `trtllm-bench`/`trtllm-serve`
#!/bin/bash
# Prepare dataset for the benchmark
-python3 benchmarks/cpp/prepare_dataset.py \
- --tokenizer=${MODEL_PATH} \
- --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \
- --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
+trtllm-bench --model ${MODEL_PATH} \
+ prepare-dataset \
+ --output dataset.txt \
+ token-norm-dist \
+ --num-requests=${NUM_SAMPLES} \
+ --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0
# Benchmark and profile
TLLM_PROFILE_START_STOP=100-150 nsys profile \
diff --git a/latest/_sources/developer-guide/perf-benchmarking.md.txt b/latest/_sources/developer-guide/perf-benchmarking.md.txt
index 4e4e3ca421..63bd9f6f8f 100644
--- a/latest/_sources/developer-guide/perf-benchmarking.md.txt
+++ b/latest/_sources/developer-guide/perf-benchmarking.md.txt
@@ -152,7 +152,7 @@ directory. For example, to generate a synthetic dataset of 1000 requests with a
128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run:
```shell
-python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt
+trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000
```
### Running with the PyTorch Workflow
@@ -233,13 +233,13 @@ The PyTorch workflow supports benchmarking with LoRA (Low-Rank Adaptation) adapt
**Preparing LoRA Dataset**
-Use `prepare_dataset.py` with LoRA-specific options to generate requests with LoRA metadata:
+Use `trtllm-bench prepare-dataset` with LoRA-specific options to generate requests with LoRA metadata:
```shell
-python3 benchmarks/cpp/prepare_dataset.py \
- --stdout \
+trtllm-bench \
+ --model /path/to/tokenizer \
+ prepare-dataset \
--rand-task-id 0 1 \
- --tokenizer /path/to/tokenizer \
--lora-dir /path/to/loras \
token-norm-dist \
--num-requests 100 \
@@ -310,17 +310,18 @@ Each subdirectory should contain the LoRA adapter files for that specific task.
To benchmark multi-modal models with PyTorch workflow, you can follow the similar approach as above.
First, prepare the dataset:
-```python
-python ./benchmarks/cpp/prepare_dataset.py \
- --tokenizer Qwen/Qwen2-VL-2B-Instruct \
- --stdout \
- dataset \
+```bash
+trtllm-bench \
+ --model Qwen/Qwen2-VL-2B-Instruct \
+ prepare-dataset \
+ --output mm_data.jsonl \
+ real-dataset \
--dataset-name lmms-lab/MMMU \
--dataset-split test \
--dataset-image-key image \
--dataset-prompt-key question \
--num-requests 10 \
- --output-len-dist 128,5 > mm_data.jsonl
+ --output-len-dist 128,5
```
It will download the media files to `/tmp` directory and prepare the dataset with their paths. Note that the `prompt` fields are texts and not tokenized ids. This is due to the fact that
the `prompt` and the media (image/video) are processed by a preprocessor for multimodal files.
@@ -423,10 +424,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp
- [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
-To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html).
+To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html).
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
-file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
+file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
above:
diff --git a/latest/_sources/developer-guide/perf-overview.md.txt b/latest/_sources/developer-guide/perf-overview.md.txt
index 0a144a58d4..aefa91fd43 100644
--- a/latest/_sources/developer-guide/perf-overview.md.txt
+++ b/latest/_sources/developer-guide/perf-overview.md.txt
@@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To
The performance numbers below were collected using the steps described in this document.
-Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
+Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).
*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:*
diff --git a/latest/_sources/examples/curl_chat_client.rst.txt b/latest/_sources/examples/curl_chat_client.rst.txt
index d3709ccd9c..f5a6ef236b 100644
--- a/latest/_sources/examples/curl_chat_client.rst.txt
+++ b/latest/_sources/examples/curl_chat_client.rst.txt
@@ -2,7 +2,7 @@ Curl Chat Client
================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_chat_client.sh.
.. literalinclude:: ../../../examples/serve/curl_chat_client.sh
:lines: 1-11
diff --git a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
index 73760884c2..17e6340f42 100644
--- a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ Curl Chat Client For Multimodal
===============================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client_for_multimodal.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_chat_client_for_multimodal.sh.
.. literalinclude:: ../../../examples/serve/curl_chat_client_for_multimodal.sh
:lines: 1-88
diff --git a/latest/_sources/examples/curl_completion_client.rst.txt b/latest/_sources/examples/curl_completion_client.rst.txt
index c2f4e9a14e..b4ef6aa5d3 100644
--- a/latest/_sources/examples/curl_completion_client.rst.txt
+++ b/latest/_sources/examples/curl_completion_client.rst.txt
@@ -2,7 +2,7 @@ Curl Completion Client
======================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_completion_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_completion_client.sh.
.. literalinclude:: ../../../examples/serve/curl_completion_client.sh
:lines: 1-10
diff --git a/latest/_sources/examples/curl_responses_client.rst.txt b/latest/_sources/examples/curl_responses_client.rst.txt
new file mode 100644
index 0000000000..bcb3bcd62b
--- /dev/null
+++ b/latest/_sources/examples/curl_responses_client.rst.txt
@@ -0,0 +1,10 @@
+Curl Responses Client
+=====================
+Refer to the `trtllm-serve documentation `_ for starting a server.
+
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_responses_client.sh.
+
+.. literalinclude:: ../../../examples/serve/curl_responses_client.sh
+ :lines: 1-9
+ :language: bash
+ :linenos:
diff --git a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
index 4e0a039fe1..4121dcc52f 100644
--- a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
+++ b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
@@ -2,7 +2,7 @@ Deepseek R1 Reasoning Parser
============================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/deepseek_r1_reasoning_parser.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/deepseek_r1_reasoning_parser.sh.
.. literalinclude:: ../../../examples/serve/deepseek_r1_reasoning_parser.sh
:lines: 1-23
diff --git a/latest/_sources/examples/genai_perf_client.rst.txt b/latest/_sources/examples/genai_perf_client.rst.txt
index 4f222352aa..9bb9012949 100644
--- a/latest/_sources/examples/genai_perf_client.rst.txt
+++ b/latest/_sources/examples/genai_perf_client.rst.txt
@@ -2,7 +2,7 @@ Genai Perf Client
=================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/genai_perf_client.sh.
.. literalinclude:: ../../../examples/serve/genai_perf_client.sh
:lines: 1-16
diff --git a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
index 6ae821dace..aa6f66eace 100644
--- a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ Genai Perf Client For Multimodal
================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client_for_multimodal.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/genai_perf_client_for_multimodal.sh.
.. literalinclude:: ../../../examples/serve/genai_perf_client_for_multimodal.sh
:lines: 1-19
diff --git a/latest/_sources/examples/llm_guided_decoding.rst.txt b/latest/_sources/examples/llm_guided_decoding.rst.txt
index c7a50512da..c1c9622871 100644
--- a/latest/_sources/examples/llm_guided_decoding.rst.txt
+++ b/latest/_sources/examples/llm_guided_decoding.rst.txt
@@ -1,6 +1,6 @@
Generate text with guided decoding
==================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_guided_decoding.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_guided_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py
:lines: 4-47
diff --git a/latest/_sources/examples/llm_inference.rst.txt b/latest/_sources/examples/llm_inference.rst.txt
index be80e456eb..a0379d8bf0 100644
--- a/latest/_sources/examples/llm_inference.rst.txt
+++ b/latest/_sources/examples/llm_inference.rst.txt
@@ -1,6 +1,6 @@
Generate text
=============
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference.py
:lines: 4-35
diff --git a/latest/_sources/examples/llm_inference_async.rst.txt b/latest/_sources/examples/llm_inference_async.rst.txt
index f7ff40a646..3da36720c2 100644
--- a/latest/_sources/examples/llm_inference_async.rst.txt
+++ b/latest/_sources/examples/llm_inference_async.rst.txt
@@ -1,6 +1,6 @@
Generate text asynchronously
============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_async.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_async.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_async.py
:lines: 4-43
diff --git a/latest/_sources/examples/llm_inference_async_streaming.rst.txt b/latest/_sources/examples/llm_inference_async_streaming.rst.txt
index 0736586f2f..5d4711e145 100644
--- a/latest/_sources/examples/llm_inference_async_streaming.rst.txt
+++ b/latest/_sources/examples/llm_inference_async_streaming.rst.txt
@@ -1,6 +1,6 @@
Generate text in streaming
==========================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_async_streaming.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_async_streaming.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_async_streaming.py
:lines: 4-64
diff --git a/latest/_sources/examples/llm_inference_distributed.rst.txt b/latest/_sources/examples/llm_inference_distributed.rst.txt
index a04aa99313..07cc8963df 100644
--- a/latest/_sources/examples/llm_inference_distributed.rst.txt
+++ b/latest/_sources/examples/llm_inference_distributed.rst.txt
@@ -1,6 +1,6 @@
Distributed LLM Generation
==========================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_distributed.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_distributed.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_distributed.py
:lines: 4-44
diff --git a/latest/_sources/examples/llm_kv_cache_connector.rst.txt b/latest/_sources/examples/llm_kv_cache_connector.rst.txt
index 0a150c4a36..32b443ae33 100644
--- a/latest/_sources/examples/llm_kv_cache_connector.rst.txt
+++ b/latest/_sources/examples/llm_kv_cache_connector.rst.txt
@@ -1,6 +1,6 @@
KV Cache Connector
==================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_kv_cache_connector.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_kv_cache_connector.py.
.. literalinclude:: ../../../examples/llm-api/llm_kv_cache_connector.py
:lines: 4-326
diff --git a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
index a64445a962..5ae7bb74b1 100644
--- a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
+++ b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
@@ -1,6 +1,6 @@
KV Cache Offloading
===================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_kv_cache_offloading.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_kv_cache_offloading.py.
.. literalinclude:: ../../../examples/llm-api/llm_kv_cache_offloading.py
:lines: 4-134
diff --git a/latest/_sources/examples/llm_logits_processor.rst.txt b/latest/_sources/examples/llm_logits_processor.rst.txt
index b739b44ca9..e2c401f98b 100644
--- a/latest/_sources/examples/llm_logits_processor.rst.txt
+++ b/latest/_sources/examples/llm_logits_processor.rst.txt
@@ -1,6 +1,6 @@
Control generated text using logits processor
=============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_logits_processor.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_logits_processor.py.
.. literalinclude:: ../../../examples/llm-api/llm_logits_processor.py
:lines: 4-128
diff --git a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
index 0a84a19a28..fbaaae9489 100644
--- a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
@@ -1,6 +1,6 @@
Run LLM-API with pytorch backend on Slurm
=========================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_llm_distributed.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_llm_distributed.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_llm_distributed.sh
:lines: 1-48,52-94
diff --git a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
index ddfa9f47ca..bb9f5bfdb7 100644
--- a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
@@ -1,8 +1,8 @@
Run trtllm-bench with pytorch backend on Slurm
==============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_trtllm_bench.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_trtllm_bench.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_bench.sh
- :lines: 1-46,50-131
+ :lines: 1-46,50-130
:language: bash
:linenos:
diff --git a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
index 18e6c10c8c..d3ebb95460 100644
--- a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
@@ -1,6 +1,6 @@
Run trtllm-serve with pytorch backend on Slurm
==============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_trtllm_serve.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_trtllm_serve.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_serve.sh
:lines: 1-46,50-92
diff --git a/latest/_sources/examples/llm_multilora.rst.txt b/latest/_sources/examples/llm_multilora.rst.txt
index b0f9fdf5ec..5a4ef4786d 100644
--- a/latest/_sources/examples/llm_multilora.rst.txt
+++ b/latest/_sources/examples/llm_multilora.rst.txt
@@ -1,6 +1,6 @@
Generate text with multiple LoRA adapters
=========================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_multilora.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_multilora.py.
.. literalinclude:: ../../../examples/llm-api/llm_multilora.py
:lines: 4-89
diff --git a/latest/_sources/examples/llm_runtime.rst.txt b/latest/_sources/examples/llm_runtime.rst.txt
index c7405bcbe5..b5c67ea9d7 100644
--- a/latest/_sources/examples/llm_runtime.rst.txt
+++ b/latest/_sources/examples/llm_runtime.rst.txt
@@ -1,6 +1,6 @@
Runtime Configuration Examples
==============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_runtime.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_runtime.py.
.. literalinclude:: ../../../examples/llm-api/llm_runtime.py
:lines: 4-144
diff --git a/latest/_sources/examples/llm_sampling.rst.txt b/latest/_sources/examples/llm_sampling.rst.txt
index bc4c60a7ce..050450c330 100644
--- a/latest/_sources/examples/llm_sampling.rst.txt
+++ b/latest/_sources/examples/llm_sampling.rst.txt
@@ -1,6 +1,6 @@
Sampling Techniques Showcase
============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_sampling.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_sampling.py.
.. literalinclude:: ../../../examples/llm-api/llm_sampling.py
:lines: 4-248
diff --git a/latest/_sources/examples/llm_sparse_attention.rst.txt b/latest/_sources/examples/llm_sparse_attention.rst.txt
index 1c398bb1f0..c13f175d1e 100644
--- a/latest/_sources/examples/llm_sparse_attention.rst.txt
+++ b/latest/_sources/examples/llm_sparse_attention.rst.txt
@@ -1,6 +1,6 @@
Sparse Attention
================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_sparse_attention.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_sparse_attention.py.
.. literalinclude:: ../../../examples/llm-api/llm_sparse_attention.py
:lines: 4-229
diff --git a/latest/_sources/examples/llm_speculative_decoding.rst.txt b/latest/_sources/examples/llm_speculative_decoding.rst.txt
index 689d6af530..dbfca2fb58 100644
--- a/latest/_sources/examples/llm_speculative_decoding.rst.txt
+++ b/latest/_sources/examples/llm_speculative_decoding.rst.txt
@@ -1,6 +1,6 @@
Speculative Decoding
====================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_speculative_decoding.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_speculative_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_speculative_decoding.py
:lines: 4-95
diff --git a/latest/_sources/examples/openai_chat_client.rst.txt b/latest/_sources/examples/openai_chat_client.rst.txt
index 29cf974ab0..bc25fbfefb 100644
--- a/latest/_sources/examples/openai_chat_client.rst.txt
+++ b/latest/_sources/examples/openai_chat_client.rst.txt
@@ -2,7 +2,7 @@ OpenAI Chat Client
==================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_chat_client.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_chat_client.py.
.. literalinclude:: ../../../examples/serve/openai_chat_client.py
:lines: 2-21
diff --git a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
index b3fb0a07bc..9eb49504d9 100644
--- a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ OpenAI Chat Client for Multimodal
=================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_chat_client_for_multimodal.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_chat_client_for_multimodal.py.
.. literalinclude:: ../../../examples/serve/openai_chat_client_for_multimodal.py
:lines: 2-129
diff --git a/latest/_sources/examples/openai_completion_client.rst.txt b/latest/_sources/examples/openai_completion_client.rst.txt
index 7b60afc04d..54a9fac182 100644
--- a/latest/_sources/examples/openai_completion_client.rst.txt
+++ b/latest/_sources/examples/openai_completion_client.rst.txt
@@ -2,7 +2,7 @@ OpenAI Completion Client
========================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client.py
:lines: 2-15
diff --git a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
index 4eabf04fea..121ff107e2 100644
--- a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
+++ b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
@@ -2,7 +2,7 @@ Openai Completion Client For Lora
=================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client_for_lora.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client_for_lora.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client_for_lora.py
:lines: 1-30
diff --git a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
index 8ed397f1cd..1eee39507d 100644
--- a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
+++ b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
@@ -2,7 +2,7 @@ OpenAI Completion Client with JSON Schema
=========================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client_json_schema.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client_json_schema.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client_json_schema.py
:lines: 2-52
diff --git a/latest/_sources/examples/openai_responses_client.rst.txt b/latest/_sources/examples/openai_responses_client.rst.txt
new file mode 100644
index 0000000000..f8b4c62bc5
--- /dev/null
+++ b/latest/_sources/examples/openai_responses_client.rst.txt
@@ -0,0 +1,10 @@
+OpenAI Responses Client
+=======================
+Refer to the `trtllm-serve documentation `_ for starting a server.
+
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_responses_client.py.
+
+.. literalinclude:: ../../../examples/serve/openai_responses_client.py
+ :lines: 2-15
+ :language: python
+ :linenos:
diff --git a/latest/_sources/examples/trtllm_serve_examples.rst.txt b/latest/_sources/examples/trtllm_serve_examples.rst.txt
index f39dfcee67..e61fd0e9ff 100644
--- a/latest/_sources/examples/trtllm_serve_examples.rst.txt
+++ b/latest/_sources/examples/trtllm_serve_examples.rst.txt
@@ -10,6 +10,7 @@ Online Serving Examples
curl_chat_client
curl_chat_client_for_multimodal
curl_completion_client
+ curl_responses_client
deepseek_r1_reasoning_parser
genai_perf_client
genai_perf_client_for_multimodal
@@ -18,4 +19,5 @@ Online Serving Examples
openai_completion_client
openai_completion_client_for_lora
openai_completion_client_json_schema
+ openai_responses_client
diff --git a/latest/_sources/features/auto_deploy/support_matrix.md.txt b/latest/_sources/features/auto_deploy/support_matrix.md.txt
index 26c07b308b..fec6d841af 100644
--- a/latest/_sources/features/auto_deploy/support_matrix.md.txt
+++ b/latest/_sources/features/auto_deploy/support_matrix.md.txt
@@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations:
### Precision Support
-AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
**Supported precision types include:**
diff --git a/latest/_sources/features/quantization.md.txt b/latest/_sources/features/quantization.md.txt
index 8a0e160529..7998f1c03a 100644
--- a/latest/_sources/features/quantization.md.txt
+++ b/latest/_sources/features/quantization.md.txt
@@ -11,6 +11,7 @@ TensorRT LLM offers a variety of quantization recipes to optimize LLM inference.
* FP8 Block Scaling
* FP8 Rowwise
* FP8 KV Cache
+* NVFP4 KV Cache
* W4A16 GPTQ
* W4A8 GPTQ
* W4A16 AWQ
@@ -23,7 +24,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac
### Running Pre-quantized Models
-TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
```python
from tensorrt_llm import LLM
@@ -47,6 +48,20 @@ llm = LLM(model='/path/to/model',
llm.generate("Hello, my name is")
```
+#### NVFP4 KV Cache
+
+To enable the NVFP4 KV cache, the model must first be quantized offline with ModelOpt; follow the instructions in the "Offline Quantization with ModelOpt" section below.
+Once quantization is complete, enable the NVFP4 KV cache as follows:
+
+```python
+from tensorrt_llm import LLM
+from tensorrt_llm.llmapi import KvCacheConfig
+llm = LLM(model='/path/to/model',
+ kv_cache_config=KvCacheConfig(dtype='nvfp4'))
+llm.generate("Hello, my name is")
+```
+
+
### Offline Quantization with ModelOpt
If a pre-quantized model is not available on the [Hugging Face Hub](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4), you can quantize it offline using ModelOpt.
@@ -54,35 +69,47 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg
Follow this step-by-step guide to quantize a model:
```bash
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
-cd TensorRT-Model-Optimizer/examples/llm_ptq
-scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+cd Model-Optimizer/examples/llm_ptq
+scripts/huggingface_example.sh --model --quant fp8
```
+#### NVFP4 KV Cache
+
+To generate the checkpoint for NVFP4 KV cache:
+
+```bash
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+cd Model-Optimizer/examples/llm_ptq
+scripts/huggingface_example.sh --model --quant fp8 --kv_cache_quant nvfp4
+```
+
+Note that TRT-LLM currently supports only FP8 weight/activation quantization when the NVFP4 KV cache is enabled, so `--quant fp8` is required here.
+
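For reference, a minimal end-to-end sketch of loading such a checkpoint (the checkpoint directory name below is illustrative, not the exact path exported by the ModelOpt script) could look like:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Point LLM at the FP8 checkpoint produced by the offline quantization step
# above (directory name is illustrative) and request the NVFP4 KV cache.
llm = LLM(model='/path/to/exported_fp8_checkpoint',
          kv_cache_config=KvCacheConfig(dtype='nvfp4'))
print(llm.generate("Hello, my name is"))
```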
## Model Supported Matrix
-| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
-| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: |
-| BERT | . | . | . | . | . | Y | . | . | . | . |
-| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . |
-| EXAONE | . | . | Y | . | . | Y | Y | Y | . | . |
-| Gemma 3 | . | . | Y | . | . | Y | Y | Y | . | . |
-| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . |
-| LLaMA | Y | . | Y | . | . | Y | . | Y | . | Y |
-| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | . | Y |
-| LLaMA 3 | . | . | . | . | Y | Y | Y | . | . | . |
-| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . |
-| Mistral | . | . | Y | . | . | Y | . | Y | . | . |
-| Mixtral | Y | . | Y | . | . | Y | . | . | . | . |
-| Phi | . | . | . | . | . | Y | Y | . | . | . |
-| Qwen | . | . | . | . | . | Y | Y | Y | . | Y |
-| Qwen-2/2.5 | Y | . | Y | . | . | Y | Y | Y | . | Y |
-| Qwen-3 | Y | . | Y | . | . | Y | . | Y | . | Y |
-| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . |
-| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . |
-| LLaVA | . | . | Y | . | . | Y | . | Y | . | Y |
-| VILA | . | . | Y | . | . | Y | . | Y | . | Y |
-| Nougat | . | . | . | . | . | Y | . | . | . | . |
+| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
+| :------------- | :---: | :---: | :---: | :---: | :---: | :---: |:---:| :-------: | :-------: | :--------: | :--------: |
+| BERT | . | . | . | . | . | Y | . | . | . | . | . |
+| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . | . |
+| EXAONE | . | . | Y | . | . | Y | . | Y | Y | . | . |
+| Gemma 3 | . | . | Y | . | . | Y | . | Y | Y | . | . |
+| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . | . |
+| LLaMA | Y | . | Y | . | . | Y | . | . | Y | . | Y |
+| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | Y | . | Y |
+| LLaMA 3 | . | . | . | . | Y | Y | Y | Y | . | . | . |
+| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . | . |
+| Mistral | . | . | Y | . | . | Y | . | . | Y | . | . |
+| Mixtral | Y | . | Y | . | . | Y | . | . | . | . | . |
+| Phi | . | . | . | . | . | Y | . | Y | . | . | . |
+| Qwen | . | . | . | . | . | Y | . | Y | Y | . | Y |
+| Qwen-2/2.5 | Y | . | Y | . | . | Y | . | Y | Y | . | Y |
+| Qwen-3 | Y | . | Y | . | . | Y | Y | . | Y | . | Y |
+| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . | . |
+| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . | . |
+| LLaVA | . | . | Y | . | . | Y | . | . | Y | . | Y |
+| VILA | . | . | Y | . | . | Y | . | . | Y | . | Y |
+| Nougat | . | . | . | . | . | Y | . | . | . | . | . |
```{note}
@@ -93,13 +120,13 @@ The language component decides which quantization methods are supported by a giv
## Hardware Support Matrix
-| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
-| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: |
-| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . |
-| Blackwell(sm100) | Y | Y | Y | Y | . | Y | . | . | . | . |
-| Hopper | . | . | Y | Y | Y | Y | Y | Y | Y | Y |
-| Ada Lovelace | . | . | Y | . | . | Y | Y | Y | Y | Y |
-| Ampere | . | . | . | . | . | Y | . | Y | . | Y |
+| Hardware | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
+| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: |
+| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . | . |
+| Blackwell(sm100) | Y | Y | Y | Y | . | Y | Y | . | . | . | . |
+| Hopper | . | . | Y | Y | Y | Y | . | Y | Y | Y | Y |
+| Ada Lovelace | . | . | Y | . | . | Y | . | Y | Y | Y | Y |
+| Ampere | . | . | . | . | . | Y | . | . | Y | . | Y |
```{note}
FP8 block-wise scaling GEMM kernels for sm100 use the MXFP8 recipe (E4M3 act/weight and UE8M0 act/weight scale), which differs slightly from the SM90 FP8 recipe (E4M3 act/weight and FP32 act/weight scale).
```
@@ -108,4 +135,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w
## Quick Links
- [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4)
-- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html)
+- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html)
diff --git a/latest/_sources/legacy/performance/perf-analysis.md.txt b/latest/_sources/legacy/performance/perf-analysis.md.txt
index f72437f4e9..51abd6460d 100644
--- a/latest/_sources/legacy/performance/perf-analysis.md.txt
+++ b/latest/_sources/legacy/performance/perf-analysis.md.txt
@@ -66,10 +66,10 @@ Say we want to profile iterations 100 to 150 on a trtllm-bench/trtllm-serve run,
#!/bin/bash
# Prepare dataset for the benchmark
-python3 benchmarks/cpp/prepare_dataset.py \
- --tokenizer=${MODEL_PATH} \
- --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \
- --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
+trtllm-bench \
+ --model=${MODEL_PATH} prepare-dataset \
+ --output /tmp/dataset.txt token-norm-dist --num-requests=${NUM_SAMPLES} \
+ --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0
# Benchmark and profile
TLLM_PROFILE_START_STOP=100-150 nsys profile \
diff --git a/latest/_sources/legacy/performance/perf-benchmarking.md.txt b/latest/_sources/legacy/performance/perf-benchmarking.md.txt
index 55caef07ba..9530b6da1b 100644
--- a/latest/_sources/legacy/performance/perf-benchmarking.md.txt
+++ b/latest/_sources/legacy/performance/perf-benchmarking.md.txt
@@ -110,7 +110,7 @@ of 128:128.
To run the benchmark from start to finish, run the following commands:
```shell
-python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt
+trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000
trtllm-bench --model meta-llama/Llama-3.1-8B build --dataset /tmp/synthetic_128_128.txt --quantization FP8
trtllm-bench --model meta-llama/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-3.1-8B/tp_1_pp_1
```
@@ -207,7 +207,7 @@ directory. For example, to generate a synthetic dataset of 1000 requests with a
128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run:
```shell
-benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt
+trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000
```
### Building a Benchmark Engine
@@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
-file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
+file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints
above:
diff --git a/latest/_sources/legacy/reference/support-matrix.md.txt b/latest/_sources/legacy/reference/support-matrix.md.txt
index 1dc59fcfa0..24a3a01512 100644
--- a/latest/_sources/legacy/reference/support-matrix.md.txt
+++ b/latest/_sources/legacy/reference/support-matrix.md.txt
@@ -133,6 +133,7 @@ In addition, older architectures can have limitations for newer software release
* - GPU Model Architectures
-
- [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/)
+ - [NVIDIA GB300 NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72/)
- [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)
- [NVIDIA Grace Hopper Superchip](https://www.nvidia.com/en-us/data-center/grace-hopper-superchip/)
- [NVIDIA Hopper Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/)
diff --git a/latest/_sources/llm-api/reference.rst.txt b/latest/_sources/llm-api/reference.rst.txt
index 76a2c9f0e2..8816f4ccc3 100644
--- a/latest/_sources/llm-api/reference.rst.txt
+++ b/latest/_sources/llm-api/reference.rst.txt
@@ -17,6 +17,14 @@ API Reference
:member-order: groupwise
:inherited-members:
+.. autoclass:: tensorrt_llm.llmapi.AsyncLLM
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members: __init__
+ :member-order: groupwise
+ :inherited-members:
+
.. autoclass:: tensorrt_llm.llmapi.MultimodalEncoder
:members:
:undoc-members:
@@ -288,7 +296,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
- :exclude-members: model_parametrized_name,update_forward_refs,model_rebuild,parse_raw,from_orm,model_validate_strings,model_computed_fields,validate,model_post_init,model_copy,dict,schema,parse_obj,json,model_validate_json,copy,model_config,model_dump_json,model_fields,schema_json,construct,model_extra,model_json_schema,model_validate,model_dump,parse_file,model_fields_set,model_construct
+ :exclude-members: model_rebuild,model_fields_set,parse_obj,model_post_init,model_fields,validate,from_orm,update_forward_refs,model_dump_json,model_dump,parse_file,model_json_schema,model_parametrized_name,json,model_validate,model_config,model_copy,model_construct,parse_raw,model_validate_json,dict,construct,schema,copy,model_validate_strings,model_computed_fields,model_extra,schema_json
.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
:members:
@@ -297,7 +305,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
- :exclude-members: model_parametrized_name,update_forward_refs,model_rebuild,parse_raw,from_orm,model_validate_strings,model_computed_fields,validate,model_post_init,model_copy,dict,schema,parse_obj,json,model_validate_json,copy,model_config,model_dump_json,model_fields,schema_json,construct,model_extra,model_json_schema,model_validate,model_dump,parse_file,model_fields_set,model_construct
+ :exclude-members: model_rebuild,model_fields_set,parse_obj,model_post_init,model_fields,validate,from_orm,update_forward_refs,model_dump_json,model_dump,parse_file,model_json_schema,model_parametrized_name,json,model_validate,model_config,model_copy,model_construct,parse_raw,model_validate_json,dict,construct,schema,copy,model_validate_strings,model_computed_fields,model_extra,schema_json
.. autoclass:: tensorrt_llm.llmapi.AutoDecodingConfig
:members:
diff --git a/latest/_sources/models/supported-models.md.txt b/latest/_sources/models/supported-models.md.txt
index c6b6194b5d..40f3840073 100644
--- a/latest/_sources/models/supported-models.md.txt
+++ b/latest/_sources/models/supported-models.md.txt
@@ -8,6 +8,7 @@ The following is a table of supported models for the PyTorch backend:
| `BertForSequenceClassification` | BERT-based | `textattack/bert-base-uncased-yelp-polarity` |
| `DeciLMForCausalLM` | Nemotron | `nvidia/Llama-3_1-Nemotron-51B-Instruct` |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` |
+| `DeepseekV32ForCausalLM` | DeepSeek-V3.2 | `deepseek-ai/DeepSeek-V3.2` |
| `Exaone4ForCausalLM` | EXAONE 4.0 | `LGAI-EXAONE/EXAONE-4.0-32B` |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it` |
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b` |
@@ -34,6 +35,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
| Model Architecture/Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Sliding Window Attention | Logits Post Processor | Guided Decoding |
| ------------------------------ | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | --- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ------------------------ | --------------------- | --------------- |
| `DeepseekV3ForCausalLM` | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes |
+| `DeepseekV32ForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3MoeForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3NextForCausalLM` | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| `Llama4ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes |
diff --git a/latest/_sources/overview.md.txt b/latest/_sources/overview.md.txt
index 0df4f72539..471e57ff23 100644
--- a/latest/_sources/overview.md.txt
+++ b/latest/_sources/overview.md.txt
@@ -4,7 +4,7 @@
## About TensorRT LLM
-[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs.
+[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs.
## Key Capabilities
@@ -40,7 +40,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**.
### 🚀 **Advanced Optimization & Production Features**
- **[In-Flight Batching & Paged Attention](./features/paged-attention-ifb-scheduler.md)**: In-flight batching eliminates wait times by dynamically managing request execution, processing context and generation phases together for maximum GPU utilization and reduced latency.
- **[Multi-GPU Multi-Node Inference](./features/parallel-strategy.md)**: Seamless distributed inference with tensor, pipeline, and expert parallelism across multiple GPUs and nodes through the Model Definition API.
-- **[Advanced Quantization](./features/quantization.md)**:
+- **[Advanced Quantization](./features/quantization.md)**:
- **FP4 Quantization**: Native support on NVIDIA B200 GPUs with optimized FP4 kernels
- **FP8 Quantization**: Automatic conversion on NVIDIA H100 GPUs leveraging Hopper architecture
- **[Speculative Decoding](./features/speculative-decoding.md)**: Multiple algorithms including EAGLE, MTP and NGram
@@ -54,7 +54,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**.
### 🔧 **Latest GPU Architecture Support**
TensorRT LLM supports the full spectrum of NVIDIA GPU architectures:
-- **NVIDIA Blackwell**: B200, GB200, RTX Pro 6000 SE with FP4 optimization
+- **NVIDIA Blackwell**: B200, GB200, B300, GB300, and RTX Pro 6000 SE with FP4 optimization
- **NVIDIA Hopper**: H100, H200, GH200 with FP8 acceleration
- **NVIDIA Ada Lovelace**: L40/L40S, RTX 40 series with FP8 acceleration
- **NVIDIA Ampere**: A100, RTX 30 series for production workloads
diff --git a/latest/_sources/quick-start-guide.md.txt b/latest/_sources/quick-start-guide.md.txt
index 088f70b3ea..6eff451feb 100644
--- a/latest/_sources/quick-start-guide.md.txt
+++ b/latest/_sources/quick-start-guide.md.txt
@@ -10,7 +10,7 @@ This is the starting point to try out TensorRT LLM. Specifically, this Quick Sta
The [TensorRT LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) maintained by NVIDIA contains all of the required dependencies pre-installed. You can start the container on a machine with NVIDIA GPUs via:
```bash
-docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5
+docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6
```
diff --git a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
index 4df92f0cf7..cf4c2c94dd 100644
--- a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
+++ b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
@@ -190,6 +190,25 @@ Specifies which sharding dimensions to apply during heuristic sharding. The avai
You can enable multiple dimensions simultaneously. For example, `['tp', 'ep']` will apply both tensor parallelism and expert parallelism.
+#### `process_grid` (dict, default: `None`)
+
+Specifies a 2D device mesh for hybrid EP+TP parallelism.
+
+- NOTE 1: This grid applies only to the MoE layers. Attention, Mamba, and MLP layers are unaffected.
+- NOTE 2: The order of the keys matters. Process grid's layout is in the generalized column-major order,
+ that is, the last dimension is stride-one.
+- NOTE 3: `ep * tp` must be equal to the provided world size. Otherwise, the mesh will be considered invalid,
+ and 1D ep-only parallelism will be applied.
+
+Example:
+
+```
+ process_grid: {'ep': 2, 'tp': 2}
+```
+
+If `world_size == 4`, ranks \[0,1\] and \[2,3\] will create two EP groups. Experts will be distributed across these two
+groups, and internally, TP=2 column-row sharding will be applied.
+
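To make the layout concrete, here is a small illustrative Python sketch (not AutoDeploy code; it only reproduces the rank arithmetic described above, assuming the last key, `tp`, is the stride-one dimension):

```python
# Illustrative only: rank layout implied by process_grid = {'ep': 2, 'tp': 2}.
ep_size, tp_size = 2, 2
world_size = ep_size * tp_size  # ep * tp must equal the provided world size

# rank = ep_index * tp_size + tp_index, so consecutive ranks land in the same
# group; each group owns one shard of the experts and applies TP=2
# column-row sharding internally.
groups = [[e * tp_size + t for t in range(tp_size)] for e in range(ep_size)]
print(groups)  # [[0, 1], [2, 3]]
```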
#### `requires_shape_prop` (bool, default: `true`)
Whether shape propagation is required before applying this transform. Shape propagation enables the transform to make informed decisions about sharding strategies based on tensor dimensions.
diff --git a/latest/_sources/torch/auto_deploy/support_matrix.md.txt b/latest/_sources/torch/auto_deploy/support_matrix.md.txt
index c8780cbca1..f0158253dd 100644
--- a/latest/_sources/torch/auto_deploy/support_matrix.md.txt
+++ b/latest/_sources/torch/auto_deploy/support_matrix.md.txt
@@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations:
### Precision Support
-AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer).
**Supported precision types include:**
diff --git a/latest/_sources/torch/features/quantization.md.txt b/latest/_sources/torch/features/quantization.md.txt
index a2b6c48be2..47cc745165 100644
--- a/latest/_sources/torch/features/quantization.md.txt
+++ b/latest/_sources/torch/features/quantization.md.txt
@@ -1,7 +1,7 @@
# Quantization
The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub,
-which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
+which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
```python
from tensorrt_llm._torch import LLM
@@ -12,7 +12,7 @@ llm.generate("Hello, my name is")
Or you can try the following commands to get a quantized model by yourself:
```bash
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
-cd TensorRT-Model-Optimizer/examples/llm_ptq
+git clone https://github.com/NVIDIA/Model-Optimizer.git
+cd Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf
```
diff --git a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
index 25eacbb65f..0519f15432 100644
--- a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
+++ b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -573,7 +575,7 @@
Exploring more ISL/OSL combinations
@@ -612,8 +614,11 @@ For NVIDIA Hopper GPUs, it’s recommended to use the FP8 version of the DeepSee
YOUR_MODEL_PATH=<YOUR_MODEL_PATH>
cd $YOUR_MODEL_PATH
-## Download FP4 model for Blackwell GPUs
-git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4
+## Download NVFP4 model for Blackwell GPUs
+git clone https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2
+
+## Or the 0528 version
+git clone https://huggingface.co/nvidia/DeepSeek-R1-0528-NVFP4-v2
## Download FP8 model for Hopper GPUs
## FP8 model also works for Blackwell, but FP4 has the best performance on Blackwell.
@@ -784,13 +789,13 @@ trtllm-bench --model nvidia/DeepS
Benchmark
To do the benchmark, run the following command:
# generate synthetic dataset
-python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
- --stdout \
- --tokenizer nvidia/DeepSeek-R1-FP4 \
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+ prepare-dataset \
+ --output dataset.txt \
token-norm-dist \
--input-mean 1024 --output-mean 2048 \
--input-stdev 0 --output-stdev 0 \
- --num-requests 49152 > dataset.txt
+ --num-requests 49152
YOUR_DATA_PATH=./dataset.txt
@@ -888,13 +893,14 @@ trtllm-bench --model deepseek-ai/D
Our benchmark results are based on Batch = 1024, ISL = 1K, OSL = 2K, and num_requests = 5120 from a real dataset
To do the benchmark, run the following command:
# generate synthetic dataset
-python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
- --stdout \
- --tokenizer deepseek-ai/DeepSeek-R1 \
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+ prepare-dataset \
+ --output dataset.txt \
token-norm-dist \
--input-mean 1024 --output-mean 2048 \
--input-stdev 0 --output-stdev 0 \
- --num-requests 5120 > dataset.txt
+ --num-requests 5120
+
YOUR_DATA_PATH=./dataset.txt
cat >./extra-llm-api-config.yml<<EOF
@@ -941,10 +947,10 @@ trtllm-bench -m deepseek-ai/DeepSe
Exploring more ISL/OSL combinations
-To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use prepare_dataset.py to generate the dataset and use similar commands mentioned in the previous section. TensorRT LLM is working on enhancements that can make the benchmark process smoother.
+To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use the trtllm-bench prepare-dataset subcommand to generate the dataset and reuse the commands shown in the previous section. TensorRT LLM is working on enhancements to make the benchmarking process smoother.
WIP: Enable more features by default
-Currently, there are some features that need to be enabled through a user-defined file extra-llm-api-config.yml, such as CUDA graph, overlap scheduler and attention dp. We’re working on to enable those features by default, so that users can get good out-of-the-box performance on DeepSeek models.
+Currently, some features need to be enabled through a user-defined extra-llm-api-config.yml file, such as attention dp. We're working on enabling those features by default so that users get good out-of-the-box performance on DeepSeek models.
Note that max_batch_size and max_num_tokens can easily affect performance. Their default values are carefully chosen and should deliver good performance in most cases, but you may still need to tune them for peak performance.
Generally, make sure that max_batch_size is not so low that it bottlenecks throughput, and that max_num_tokens is large enough to cover the maximum input sequence length of the samples in the dataset, as described in the section “WIP: Chunked context support on DeepSeek models” below.
For more details on max_batch_size and max_num_tokens, refer to Tuning Max Batch Size and Max Num Tokens.
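As an illustration only (a sketch, not the configuration used for the published numbers; the option names enable_attention_dp, max_batch_size, and max_num_tokens are assumed to match the current LLM API and should be verified against the API reference), the same knobs can also be set programmatically:

```python
from tensorrt_llm import LLM

# Sketch only: the keyword arguments below are assumptions, not taken from
# this blog; check the LLM API reference before relying on them.
llm = LLM(
    model="nvidia/DeepSeek-R1-FP4",
    tensor_parallel_size=8,
    enable_attention_dp=True,   # the "attention dp" feature mentioned above
    max_batch_size=1024,        # high enough not to bottleneck throughput
    max_num_tokens=4096,        # must cover the longest input in the dataset
)
```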
@@ -1142,9 +1148,9 @@ trtllm-bench -m deepseek-ai/DeepSe
diff --git a/latest/blogs/Falcon180B-H200.html b/latest/blogs/Falcon180B-H200.html
index b6d8714df6..5b2bd0fb2e 100644
--- a/latest/blogs/Falcon180B-H200.html
+++ b/latest/blogs/Falcon180B-H200.html
@@ -61,7 +61,7 @@
@@ -74,7 +74,7 @@
-
+
@@ -358,6 +358,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -366,6 +367,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -792,9 +794,9 @@ ISL = Input Sequence Length
diff --git a/latest/blogs/H100vsA100.html b/latest/blogs/H100vsA100.html
index df0126c3d4..5c36b72892 100644
--- a/latest/blogs/H100vsA100.html
+++ b/latest/blogs/H100vsA100.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -764,9 +766,9 @@
diff --git a/latest/blogs/H200launch.html b/latest/blogs/H200launch.html
index 141a3c5120..c3f3ac0b4b 100644
--- a/latest/blogs/H200launch.html
+++ b/latest/blogs/H200launch.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -756,9 +758,9 @@ TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8.
diff --git a/latest/blogs/XQA-kernel.html b/latest/blogs/XQA-kernel.html
index 499db3c0cc..4c14c8754e 100644
--- a/latest/blogs/XQA-kernel.html
+++ b/latest/blogs/XQA-kernel.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -723,9 +725,9 @@ ISL = Input Sequence Length
diff --git a/latest/blogs/quantization-in-TRT-LLM.html b/latest/blogs/quantization-in-TRT-LLM.html
index d847c48431..5e2b5349a5 100644
--- a/latest/blogs/quantization-in-TRT-LLM.html
+++ b/latest/blogs/quantization-in-TRT-LLM.html
@@ -61,7 +61,7 @@
@@ -74,7 +74,7 @@
-
+
@@ -358,6 +358,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -366,6 +367,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -858,9 +860,9 @@
diff --git a/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html b/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html
index 1490d95381..45a685a126 100644
--- a/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html
+++ b/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html
@@ -63,7 +63,7 @@
@@ -78,7 +78,7 @@
-
+
@@ -362,6 +362,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -370,6 +371,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -1105,9 +1107,9 @@ The Pareto frontier analysis provides critical insights for real-world deploymen
diff --git a/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html b/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html
index 2255aff912..7ad6cd1172 100644
--- a/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html
+++ b/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -798,9 +800,9 @@ cat > /config/models/eagle/eagl
diff --git a/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html b/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html
index 765c4f42f7..5f7b666b8d 100644
--- a/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html
+++ b/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html
@@ -63,7 +63,7 @@
@@ -78,7 +78,7 @@
-
+
@@ -362,6 +362,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -370,6 +371,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -1019,9 +1021,9 @@
diff --git a/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html b/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html
index 6b8f5abc6f..7be3ab8af2 100644
--- a/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html
+++ b/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -1086,9 +1088,9 @@ is a certainty-based, training-free approach to accelerate Chain-of-Thought (CoT
diff --git a/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html b/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html
index 1e37727c86..f353f696d2 100644
--- a/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html
+++ b/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -578,7 +580,7 @@
wo GEMM FP4 quantization
The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1’s MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In the term, “wo” is the abbreviation for the weight matrix for the output.
-We’ve evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The NVIDIA TensorRT Model Optimizer team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
+We’ve evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The NVIDIA Model Optimizer team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
Dynamo K8s Example
@@ -633,7 +635,7 @@
*TensorRT LLM already supports FP8 Attention while for this latency scenario low-precision attention computation doesn’t help with performance so we choose to use bf16 precision for the Attention Modules.
-
** nvfp4 model checkpoint is generated by the NVIDIA TensorRT Model Optimizer toolkit.
+
** nvfp4 model checkpoint is generated by the NVIDIA Model Optimizer toolkit.
*** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability
@@ -1199,9 +1201,9 @@
diff --git a/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html
index 4af90cf90b..7a89ed19f3 100644
--- a/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html
+++ b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html
@@ -63,7 +63,7 @@
@@ -78,7 +78,7 @@
-
+
@@ -362,6 +362,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -370,6 +371,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -944,9 +946,9 @@ trtllm-bench --model nvidia/DeepSe
diff --git a/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html
index ce6fab6341..ccf5ca3729 100644
--- a/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html
+++ b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
-
+
@@ -360,6 +360,7 @@
Curl Chat Client
Curl Chat Client For Multimodal
Curl Completion Client
+Curl Responses Client
Deepseek R1 Reasoning Parser
Genai Perf Client
Genai Perf Client For Multimodal
@@ -368,6 +369,7 @@
OpenAI Completion Client
Openai Completion Client For Lora
OpenAI Completion Client with JSON Schema
+OpenAI Responses Client
Dynamo K8s Example
@@ -559,7 +561,7 @@
FP8 KV cache and FP8 attention, rather than BF16 precision.
FP4 Allgather for better communication bandwidth utilization.
-The checkpoint used in this blog is hosted in nvidia/DeepSeek-R1-FP4, generated by NVIDIA Model Optimizer. The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are:
+The checkpoint used in this blog is hosted in nvidia/DeepSeek-R1-FP4, generated by NVIDIA Model Optimizer. The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: