1from tensorrt_llm import LLM, SamplingParams
- 2
+ 1from tensorrt_llm import SamplingParams
+ 2from tensorrt_llm._tensorrt_engine import LLM
3
- 4def main():
- 5
- 6 prompts = [
- 7 "Hello, my name is",
- 8 "The president of the United States is",
- 9 "The capital of France is",
-10 "The future of AI is",
-11 ]
-12 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-13
-14 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-15
-16 outputs = llm.generate(prompts, sampling_params)
-17
-18 # Print the outputs.
-19 for output in outputs:
-20 prompt = output.prompt
-21 generated_text = output.outputs[0].text
-22 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-23
+ 4
+ 5def main():
+ 6
+ 7 prompts = [
+ 8 "Hello, my name is",
+ 9 "The president of the United States is",
+10 "The capital of France is",
+11 "The future of AI is",
+12 ]
+13 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+14
+15 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+16
+17 outputs = llm.generate(prompts, sampling_params)
+18
+19 # Print the outputs.
+20 for output in outputs:
+21 prompt = output.prompt
+22 generated_text = output.outputs[0].text
+23 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
24
-25# The entry point of the program needs to be protected for spawning processes.
-26if __name__ == '__main__':
-27 main()
+25
+26# The entry point of the program needs to be protected for spawning processes.
+27if __name__ == '__main__':
+28 main()
The LLM API can be used for both offline and online usage. See more examples of the LLM API here:
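For the online path, TensorRT-LLM ships a trtllm-serve command that exposes an OpenAI-compatible HTTP endpoint, and the client example pages diffed further down (openai_chat_client.html, openai_completion_client.html, trtllm_serve_examples.html) exercise it with the standard openai Python client. The sketch below only illustrates that pattern; it is not code taken from these pages, and the base URL, API key, and model name are placeholder assumptions for a server already running locally.

### Minimal online-usage sketch (placeholder endpoint and model name)
from openai import OpenAI


def main():
    # trtllm-serve style servers speak the OpenAI API; the key is not validated locally.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

    response = client.chat.completions.create(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        messages=[{"role": "user", "content": "The capital of France is"}],
        max_tokens=32,
    )
    print(response.choices[0].message.content)


if __name__ == '__main__':
    main()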
diff --git a/latest/examples/llm_api_examples.html b/latest/examples/llm_api_examples.html
index 1f27a75b29..90988e1716 100644
--- a/latest/examples/llm_api_examples.html
+++ b/latest/examples/llm_api_examples.html
diff --git a/latest/examples/llm_auto_parallel.html b/latest/examples/llm_auto_parallel.html
index eaa7c918e7..88e7a90c85 100644
--- a/latest/examples/llm_auto_parallel.html
+++ b/latest/examples/llm_auto_parallel.html
@@ -514,40 +514,41 @@
Automatic Parallelism with LLM
Source NVIDIA/TensorRT-LLM.
1### Automatic Parallelism with LLM
- 2from tensorrt_llm import LLM, SamplingParams
- 3
+ 2from tensorrt_llm import SamplingParams
+ 3from tensorrt_llm._tensorrt_engine import LLM
4
- 5def main():
- 6 llm = LLM(
- 7 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- 8
- 9 # Enable auto parallelism
-10 auto_parallel=True,
-11 auto_parallel_world_size=2)
-12
-13 prompts = [
-14 "Hello, my name is",
-15 "The president of the United States is",
-16 "The capital of France is",
-17 "The future of AI is",
-18 ]
-19
-20 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-21
-22 for output in llm.generate(prompts, sampling_params):
-23 print(
-24 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-25 )
-26
-27 # Got output like
-28 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-29 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-30 # Prompt: 'The capital of France is', Generated text: 'Paris.'
-31 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-32
+ 5
+ 6def main():
+ 7 llm = LLM(
+ 8 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ 9
+10 # Enable auto parallelism
+11 auto_parallel=True,
+12 auto_parallel_world_size=2)
+13
+14 prompts = [
+15 "Hello, my name is",
+16 "The president of the United States is",
+17 "The capital of France is",
+18 "The future of AI is",
+19 ]
+20
+21 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+22
+23 for output in llm.generate(prompts, sampling_params):
+24 print(
+25 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+26 )
+27
+28 # Got output like
+29 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+30 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+31 # Prompt: 'The capital of France is', Generated text: 'Paris.'
+32 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
33
-34if __name__ == '__main__':
-35 main()
+34
+35if __name__ == '__main__':
+36 main()
diff --git a/latest/examples/llm_eagle2_decoding.html b/latest/examples/llm_eagle2_decoding.html
index f22c7beb76..8e1312f242 100644
--- a/latest/examples/llm_eagle2_decoding.html
+++ b/latest/examples/llm_eagle2_decoding.html
@@ -515,8 +515,8 @@
Source NVIDIA/TensorRT-LLM.
1### Generate Text Using Eagle2 Decoding
2
- 3from tensorrt_llm import LLM, SamplingParams
- 4from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+ 3from tensorrt_llm._tensorrt_engine import LLM
+ 4from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
5 SamplingParams)
6
7
diff --git a/latest/examples/llm_eagle_decoding.html b/latest/examples/llm_eagle_decoding.html
index 7a02647a31..a131f06dc3 100644
--- a/latest/examples/llm_eagle_decoding.html
+++ b/latest/examples/llm_eagle_decoding.html
@@ -515,9 +515,9 @@
Source NVIDIA/TensorRT-LLM.
1### Generate Text Using Eagle Decoding
2
- 3from tensorrt_llm import LLM, SamplingParams
- 4from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
- 5 SamplingParams)
+ 3from tensorrt_llm import SamplingParams
+ 4from tensorrt_llm._tensorrt_engine import LLM
+ 5from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
6
7
8def main():
diff --git a/latest/examples/llm_guided_decoding.html b/latest/examples/llm_guided_decoding.html
index 16f0317ee2..979c18b758 100644
--- a/latest/examples/llm_guided_decoding.html
+++ b/latest/examples/llm_guided_decoding.html
@@ -514,50 +514,51 @@
Generate text with guided decoding
Source NVIDIA/TensorRT-LLM.
1### Generate text with guided decoding
- 2from tensorrt_llm import LLM, SamplingParams
- 3from tensorrt_llm.llmapi import GuidedDecodingParams
- 4
+ 2from tensorrt_llm import SamplingParams
+ 3from tensorrt_llm._tensorrt_engine import LLM
+ 4from tensorrt_llm.llmapi import GuidedDecodingParams
5
- 6def main():
- 7
- 8 # Specify the guided decoding backend; xgrammar is supported currently.
- 9 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-10 guided_decoding_backend='xgrammar')
-11
-12 # An example from json-mode-eval
-13 schema = '{"title": "WirelessAccessPoint", "type": "object", "properties": {"ssid": {"title": "SSID", "type": "string"}, "securityProtocol": {"title": "SecurityProtocol", "type": "string"}, "bandwidth": {"title": "Bandwidth", "type": "string"}}, "required": ["ssid", "securityProtocol", "bandwidth"]}'
-14
-15 prompt = [{
-16 'role':
-17 'system',
-18 'content':
-19 "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n{'title': 'WirelessAccessPoint', 'type': 'object', 'properties': {'ssid': {'title': 'SSID', 'type': 'string'}, 'securityProtocol': {'title': 'SecurityProtocol', 'type': 'string'}, 'bandwidth': {'title': 'Bandwidth', 'type': 'string'}}, 'required': ['ssid', 'securityProtocol', 'bandwidth']}\n</schema>\n"
-20 }, {
-21 'role':
-22 'user',
-23 'content':
-24 "I'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future. Please provide a JSON object that includes these details."
-25 }]
-26 prompt = llm.tokenizer.apply_chat_template(prompt, tokenize=False)
-27 print(f"Prompt: {prompt!r}")
-28
-29 output = llm.generate(prompt, sampling_params=SamplingParams(max_tokens=50))
-30 print(f"Generated text (unguided): {output.outputs[0].text!r}")
-31
-32 output = llm.generate(
-33 prompt,
-34 sampling_params=SamplingParams(
-35 max_tokens=50, guided_decoding=GuidedDecodingParams(json=schema)))
-36 print(f"Generated text (guided): {output.outputs[0].text!r}")
-37
-38 # Got output like
-39 # Prompt: "<|system|>\nYou are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n{'title': 'WirelessAccessPoint', 'type': 'object', 'properties': {'ssid': {'title': 'SSID', 'type': 'string'}, 'securityProtocol': {'title': 'SecurityProtocol', 'type': 'string'}, 'bandwidth': {'title': 'Bandwidth', 'type': 'string'}}, 'required': ['ssid', 'securityProtocol', 'bandwidth']}\n</schema>\n</s>\n<|user|>\nI'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future. Please provide a JSON object that includes these details.</s>\n"
-40 # Generated text (unguided): '<|assistant|>\nHere\'s a JSON object that accurately represents the settings of a wireless access point for our office network:\n\n```json\n{\n "title": "WirelessAccessPoint",\n "'
-41 # Generated text (guided): '{"ssid": "OfficeNetSecure", "securityProtocol": "WPA2-Enterprise", "bandwidth": "1300 Mbps"}'
-42
+ 6
+ 7def main():
+ 8
+ 9 # Specify the guided decoding backend; xgrammar is supported currently.
+10 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+11 guided_decoding_backend='xgrammar')
+12
+13 # An example from json-mode-eval
+14 schema = '{"title": "WirelessAccessPoint", "type": "object", "properties": {"ssid": {"title": "SSID", "type": "string"}, "securityProtocol": {"title": "SecurityProtocol", "type": "string"}, "bandwidth": {"title": "Bandwidth", "type": "string"}}, "required": ["ssid", "securityProtocol", "bandwidth"]}'
+15
+16 prompt = [{
+17 'role':
+18 'system',
+19 'content':
+20 "You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n{'title': 'WirelessAccessPoint', 'type': 'object', 'properties': {'ssid': {'title': 'SSID', 'type': 'string'}, 'securityProtocol': {'title': 'SecurityProtocol', 'type': 'string'}, 'bandwidth': {'title': 'Bandwidth', 'type': 'string'}}, 'required': ['ssid', 'securityProtocol', 'bandwidth']}\n</schema>\n"
+21 }, {
+22 'role':
+23 'user',
+24 'content':
+25 "I'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future. Please provide a JSON object that includes these details."
+26 }]
+27 prompt = llm.tokenizer.apply_chat_template(prompt, tokenize=False)
+28 print(f"Prompt: {prompt!r}")
+29
+30 output = llm.generate(prompt, sampling_params=SamplingParams(max_tokens=50))
+31 print(f"Generated text (unguided): {output.outputs[0].text!r}")
+32
+33 output = llm.generate(
+34 prompt,
+35 sampling_params=SamplingParams(
+36 max_tokens=50, guided_decoding=GuidedDecodingParams(json=schema)))
+37 print(f"Generated text (guided): {output.outputs[0].text!r}")
+38
+39 # Got output like
+40 # Prompt: "<|system|>\nYou are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n<schema>\n{'title': 'WirelessAccessPoint', 'type': 'object', 'properties': {'ssid': {'title': 'SSID', 'type': 'string'}, 'securityProtocol': {'title': 'SecurityProtocol', 'type': 'string'}, 'bandwidth': {'title': 'Bandwidth', 'type': 'string'}}, 'required': ['ssid', 'securityProtocol', 'bandwidth']}\n</schema>\n</s>\n<|user|>\nI'm currently configuring a wireless access point for our office network and I need to generate a JSON object that accurately represents its settings. The access point's SSID should be 'OfficeNetSecure', it uses WPA2-Enterprise as its security protocol, and it's capable of a bandwidth of up to 1300 Mbps on the 5 GHz band. This JSON object will be used to document our network configurations and to automate the setup process for additional access points in the future. Please provide a JSON object that includes these details.</s>\n"
+41 # Generated text (unguided): '<|assistant|>\nHere\'s a JSON object that accurately represents the settings of a wireless access point for our office network:\n\n```json\n{\n "title": "WirelessAccessPoint",\n "'
+42 # Generated text (guided): '{"ssid": "OfficeNetSecure", "securityProtocol": "WPA2-Enterprise", "bandwidth": "1300 Mbps"}'
43
-44if __name__ == '__main__':
-45 main()
+44
+45if __name__ == '__main__':
+46 main()
diff --git a/latest/examples/llm_inference.html b/latest/examples/llm_inference.html
index 024f0bf403..87bde1ace7 100644
--- a/latest/examples/llm_inference.html
+++ b/latest/examples/llm_inference.html
@@ -516,43 +516,44 @@
1### Generate text
2import tempfile
3
- 4from tensorrt_llm import LLM, SamplingParams
- 5
+ 4from tensorrt_llm import SamplingParams
+ 5from tensorrt_llm._tensorrt_engine import LLM
6
- 7def main():
- 8
- 9 # Model could accept HF model name, a path to local HF model,
-10 # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-11 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-12
-13 # You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine.
-14 llm.save(tempfile.mkdtemp())
-15
-16 # Sample prompts.
-17 prompts = [
-18 "Hello, my name is",
-19 "The president of the United States is",
-20 "The capital of France is",
-21 "The future of AI is",
-22 ]
-23
-24 # Create a sampling params.
-25 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-26
-27 for output in llm.generate(prompts, sampling_params):
-28 print(
-29 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-30 )
-31
-32 # Got output like
-33 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-34 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-35 # Prompt: 'The capital of France is', Generated text: 'Paris.'
-36 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-37
+ 7
+ 8def main():
+ 9
+10 # Model could accept HF model name, a path to local HF model,
+11 # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+12 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+13
+14 # You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine.
+15 llm.save(tempfile.mkdtemp())
+16
+17 # Sample prompts.
+18 prompts = [
+19 "Hello, my name is",
+20 "The president of the United States is",
+21 "The capital of France is",
+22 "The future of AI is",
+23 ]
+24
+25 # Create a sampling params.
+26 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+27
+28 for output in llm.generate(prompts, sampling_params):
+29 print(
+30 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+31 )
+32
+33 # Got output like
+34 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+35 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+36 # Prompt: 'The capital of France is', Generated text: 'Paris.'
+37 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
38
-39if __name__ == '__main__':
-40 main()
+39
+40if __name__ == '__main__':
+41 main()
diff --git a/latest/examples/llm_inference_async.html b/latest/examples/llm_inference_async.html
index 2fa91c3e6a..a068397664 100644
--- a/latest/examples/llm_inference_async.html
+++ b/latest/examples/llm_inference_async.html
@@ -516,46 +516,47 @@
1### Generate Text Asynchronously
2import asyncio
3
- 4from tensorrt_llm import LLM, SamplingParams
- 5
+ 4from tensorrt_llm import SamplingParams
+ 5from tensorrt_llm._tensorrt_engine import LLM
6
- 7def main():
- 8 # model could accept HF model name or a path to local HF model.
- 9 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-10
-11 # Sample prompts.
-12 prompts = [
-13 "Hello, my name is",
-14 "The president of the United States is",
-15 "The capital of France is",
-16 "The future of AI is",
-17 ]
-18
-19 # Create a sampling params.
-20 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-21
-22 # Async based on Python coroutines
-23 async def task(prompt: str):
-24 output = await llm.generate_async(prompt, sampling_params)
-25 print(
-26 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-27 )
-28
-29 async def main():
-30 tasks = [task(prompt) for prompt in prompts]
-31 await asyncio.gather(*tasks)
-32
-33 asyncio.run(main())
-34
-35 # Got output like follows:
-36 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-37 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-38 # Prompt: 'The capital of France is', Generated text: 'Paris.'
-39 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-40
+ 7
+ 8def main():
+ 9 # model could accept HF model name or a path to local HF model.
+10 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+11
+12 # Sample prompts.
+13 prompts = [
+14 "Hello, my name is",
+15 "The president of the United States is",
+16 "The capital of France is",
+17 "The future of AI is",
+18 ]
+19
+20 # Create a sampling params.
+21 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+22
+23 # Async based on Python coroutines
+24 async def task(prompt: str):
+25 output = await llm.generate_async(prompt, sampling_params)
+26 print(
+27 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+28 )
+29
+30 async def main():
+31 tasks = [task(prompt) for prompt in prompts]
+32 await asyncio.gather(*tasks)
+33
+34 asyncio.run(main())
+35
+36 # Got output like follows:
+37 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+38 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+39 # Prompt: 'The capital of France is', Generated text: 'Paris.'
+40 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
41
-42if __name__ == '__main__':
-43 main()
+42
+43if __name__ == '__main__':
+44 main()
diff --git a/latest/examples/llm_inference_async_streaming.html b/latest/examples/llm_inference_async_streaming.html
index 1176418c2d..3e7dd02b14 100644
--- a/latest/examples/llm_inference_async_streaming.html
+++ b/latest/examples/llm_inference_async_streaming.html
@@ -516,66 +516,67 @@
1### Generate Text in Streaming
2import asyncio
3
- 4from tensorrt_llm import LLM, SamplingParams
- 5
+ 4from tensorrt_llm import SamplingParams
+ 5from tensorrt_llm._tensorrt_engine import LLM
6
- 7def main():
- 8
- 9 # model could accept HF model name or a path to local HF model.
-10 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-11
-12 # Sample prompts.
-13 prompts = [
-14 "Hello, my name is",
-15 "The president of the United States is",
-16 "The capital of France is",
-17 "The future of AI is",
-18 ]
-19
-20 # Create a sampling params.
-21 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-22
-23 # Async based on Python coroutines
-24 async def task(id: int, prompt: str):
-25
-26 # streaming=True is used to enable streaming generation.
-27 async for output in llm.generate_async(prompt,
-28 sampling_params,
-29 streaming=True):
-30 print(f"Generation for prompt-{id}: {output.outputs[0].text!r}")
-31
-32 async def main():
-33 tasks = [task(id, prompt) for id, prompt in enumerate(prompts)]
-34 await asyncio.gather(*tasks)
-35
-36 asyncio.run(main())
-37
-38 # Got output like follows:
-39 # Generation for prompt-0: '\n'
-40 # Generation for prompt-3: 'an'
-41 # Generation for prompt-2: 'Paris'
-42 # Generation for prompt-1: 'likely'
-43 # Generation for prompt-0: '\n\n'
-44 # Generation for prompt-3: 'an exc'
-45 # Generation for prompt-2: 'Paris.'
-46 # Generation for prompt-1: 'likely to'
-47 # Generation for prompt-0: '\n\nJ'
-48 # Generation for prompt-3: 'an exciting'
-49 # Generation for prompt-2: 'Paris.'
-50 # Generation for prompt-1: 'likely to nomin'
-51 # Generation for prompt-0: '\n\nJane'
-52 # Generation for prompt-3: 'an exciting time'
-53 # Generation for prompt-1: 'likely to nominate'
-54 # Generation for prompt-0: '\n\nJane Smith'
-55 # Generation for prompt-3: 'an exciting time for'
-56 # Generation for prompt-1: 'likely to nominate a'
-57 # Generation for prompt-0: '\n\nJane Smith.'
-58 # Generation for prompt-3: 'an exciting time for us'
-59 # Generation for prompt-1: 'likely to nominate a new'
-60
+ 7
+ 8def main():
+ 9
+10 # model could accept HF model name or a path to local HF model.
+11 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+12
+13 # Sample prompts.
+14 prompts = [
+15 "Hello, my name is",
+16 "The president of the United States is",
+17 "The capital of France is",
+18 "The future of AI is",
+19 ]
+20
+21 # Create a sampling params.
+22 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+23
+24 # Async based on Python coroutines
+25 async def task(id: int, prompt: str):
+26
+27 # streaming=True is used to enable streaming generation.
+28 async for output in llm.generate_async(prompt,
+29 sampling_params,
+30 streaming=True):
+31 print(f"Generation for prompt-{id}: {output.outputs[0].text!r}")
+32
+33 async def main():
+34 tasks = [task(id, prompt) for id, prompt in enumerate(prompts)]
+35 await asyncio.gather(*tasks)
+36
+37 asyncio.run(main())
+38
+39 # Got output like follows:
+40 # Generation for prompt-0: '\n'
+41 # Generation for prompt-3: 'an'
+42 # Generation for prompt-2: 'Paris'
+43 # Generation for prompt-1: 'likely'
+44 # Generation for prompt-0: '\n\n'
+45 # Generation for prompt-3: 'an exc'
+46 # Generation for prompt-2: 'Paris.'
+47 # Generation for prompt-1: 'likely to'
+48 # Generation for prompt-0: '\n\nJ'
+49 # Generation for prompt-3: 'an exciting'
+50 # Generation for prompt-2: 'Paris.'
+51 # Generation for prompt-1: 'likely to nomin'
+52 # Generation for prompt-0: '\n\nJane'
+53 # Generation for prompt-3: 'an exciting time'
+54 # Generation for prompt-1: 'likely to nominate'
+55 # Generation for prompt-0: '\n\nJane Smith'
+56 # Generation for prompt-3: 'an exciting time for'
+57 # Generation for prompt-1: 'likely to nominate a'
+58 # Generation for prompt-0: '\n\nJane Smith.'
+59 # Generation for prompt-3: 'an exciting time for us'
+60 # Generation for prompt-1: 'likely to nominate a new'
61
-62if __name__ == '__main__':
-63 main()
+62
+63if __name__ == '__main__':
+64 main()
diff --git a/latest/examples/llm_inference_customize.html b/latest/examples/llm_inference_customize.html
index a1653ac7af..e3cd8fe95b 100644
--- a/latest/examples/llm_inference_customize.html
+++ b/latest/examples/llm_inference_customize.html
@@ -516,59 +516,60 @@
1### Generate text with customization
2import tempfile
3
- 4from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
- 5
+ 4from tensorrt_llm._tensorrt_engine import LLM
+ 5from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
6
- 7def main():
- 8 # The end user can customize the build configuration with the build_config class and other arguments borrowed from the lower-level APIs
- 9 build_config = BuildConfig()
-10 build_config.max_batch_size = 128
-11 build_config.max_num_tokens = 2048
-12
-13 build_config.max_beam_width = 4
-14
-15 # Model could accept HF model name or a path to local HF model.
-16
-17 llm = LLM(
-18 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-19 build_config=build_config,
-20 kv_cache_config=KvCacheConfig(
-21 free_gpu_memory_fraction=0.8
-22 ), # Similar to `build_config`, you can also customize the runtime configuration with the `kv_cache_config`, `runtime_config`, `peft_cache_config` or \
-23 # other arguments borrowed from the lower-level APIs.
-24 )
-25
-26 # You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine.
-27 llm.save(tempfile.mkdtemp())
-28
-29 # Sample prompts.
-30 prompts = [
-31 "Hello, my name is",
-32 "The president of the United States is",
-33 "The capital of France is",
-34 "The future of AI is",
-35 ]
-36
-37 # With SamplingParams, you can customize the sampling strategy, such as beam search, temperature, and so on.
-38 sampling_params = SamplingParams(temperature=0.8,
-39 top_p=0.95,
-40 n=4,
-41 use_beam_search=True)
-42
-43 for output in llm.generate(prompts, sampling_params):
-44 print(
-45 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-46 )
-47
-48 # Got output like
-49 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-50 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-51 # Prompt: 'The capital of France is', Generated text: 'Paris.'
-52 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-53
+ 7
+ 8def main():
+ 9 # The end user can customize the build configuration with the build_config class and other arguments borrowed from the lower-level APIs
+10 build_config = BuildConfig()
+11 build_config.max_batch_size = 128
+12 build_config.max_num_tokens = 2048
+13
+14 build_config.max_beam_width = 4
+15
+16 # Model could accept HF model name or a path to local HF model.
+17
+18 llm = LLM(
+19 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+20 build_config=build_config,
+21 kv_cache_config=KvCacheConfig(
+22 free_gpu_memory_fraction=0.8
+23 ), # Similar to `build_config`, you can also customize the runtime configuration with the `kv_cache_config`, `runtime_config`, `peft_cache_config` or \
+24 # other arguments borrowed from the lower-level APIs.
+25 )
+26
+27 # You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine.
+28 llm.save(tempfile.mkdtemp())
+29
+30 # Sample prompts.
+31 prompts = [
+32 "Hello, my name is",
+33 "The president of the United States is",
+34 "The capital of France is",
+35 "The future of AI is",
+36 ]
+37
+38 # With SamplingParams, you can customize the sampling strategy, such as beam search, temperature, and so on.
+39 sampling_params = SamplingParams(temperature=0.8,
+40 top_p=0.95,
+41 n=4,
+42 use_beam_search=True)
+43
+44 for output in llm.generate(prompts, sampling_params):
+45 print(
+46 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+47 )
+48
+49 # Got output like
+50 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+51 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+52 # Prompt: 'The capital of France is', Generated text: 'Paris.'
+53 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
54
-55if __name__ == '__main__':
-56 main()
+55
+56if __name__ == '__main__':
+57 main()
diff --git a/latest/examples/llm_inference_distributed.html b/latest/examples/llm_inference_distributed.html
index 5c960b76e3..01411e9b57 100644
--- a/latest/examples/llm_inference_distributed.html
+++ b/latest/examples/llm_inference_distributed.html
@@ -514,49 +514,50 @@
Distributed LLM Generation
Source NVIDIA/TensorRT-LLM.
1### Distributed LLM Generation
- 2from tensorrt_llm import LLM, SamplingParams
- 3
+ 2from tensorrt_llm import SamplingParams
+ 3from tensorrt_llm._tensorrt_engine import LLM
4
- 5def main():
- 6 # model could accept HF model name or a path to local HF model.
- 7 llm = LLM(
- 8 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- 9 # Enable 2-way tensor parallelism
-10 tensor_parallel_size=2
-11 # Enable 2-way pipeline parallelism if needed
-12 # pipeline_parallel_size=2
-13 # Enable 2-way expert parallelism for MoE model's expert weights
-14 # moe_expert_parallel_size=2
-15 # Enable 2-way tensor parallelism for MoE model's expert weights
-16 # moe_tensor_parallel_size=2
-17 )
-18
-19 # Sample prompts.
-20 prompts = [
-21 "Hello, my name is",
-22 "The president of the United States is",
-23 "The capital of France is",
-24 "The future of AI is",
-25 ]
-26
-27 # Create a sampling params.
-28 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-29
-30 for output in llm.generate(prompts, sampling_params):
-31 print(
-32 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-33 )
-34
-35 # Got output like
-36 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-37 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-38 # Prompt: 'The capital of France is', Generated text: 'Paris.'
-39 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-40
+ 5
+ 6def main():
+ 7 # model could accept HF model name or a path to local HF model.
+ 8 llm = LLM(
+ 9 model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+10 # Enable 2-way tensor parallelism
+11 tensor_parallel_size=2
+12 # Enable 2-way pipeline parallelism if needed
+13 # pipeline_parallel_size=2
+14 # Enable 2-way expert parallelism for MoE model's expert weights
+15 # moe_expert_parallel_size=2
+16 # Enable 2-way tensor parallelism for MoE model's expert weights
+17 # moe_tensor_parallel_size=2
+18 )
+19
+20 # Sample prompts.
+21 prompts = [
+22 "Hello, my name is",
+23 "The president of the United States is",
+24 "The capital of France is",
+25 "The future of AI is",
+26 ]
+27
+28 # Create a sampling params.
+29 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+30
+31 for output in llm.generate(prompts, sampling_params):
+32 print(
+33 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+34 )
+35
+36 # Got output like
+37 # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+38 # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+39 # Prompt: 'The capital of France is', Generated text: 'Paris.'
+40 # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
41
-42# The entry point of the program needs to be protected for spawning processes.
-43if __name__ == '__main__':
-44 main()
+42
+43# The entry point of the program needs to be protected for spawning processes.
+44if __name__ == '__main__':
+45 main()
diff --git a/latest/examples/llm_inference_kv_events.html b/latest/examples/llm_inference_kv_events.html
index deb9282607..1382c62fbc 100644
--- a/latest/examples/llm_inference_kv_events.html
+++ b/latest/examples/llm_inference_kv_events.html
@@ -515,52 +515,53 @@
Source NVIDIA/TensorRT-LLM.
1### Get KV Cache Events
2
- 3from tensorrt_llm import LLM, SamplingParams
- 4from tensorrt_llm.llmapi import KvCacheConfig
- 5
+ 3from tensorrt_llm import SamplingParams
+ 4from tensorrt_llm._tensorrt_engine import LLM
+ 5from tensorrt_llm.llmapi import KvCacheConfig
6
- 7def main():
- 8
- 9 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-10 tensor_parallel_size=2,
-11 autotuner_enabled=False,
-12 kv_cache_dtype='auto',
-13 kv_cache_config=KvCacheConfig(enable_block_reuse=True,
-14 event_buffer_max_size=1024),
-15 backend="pytorch")
-16
-17 # Sample prompts having a common prefix.
-18 common_prefix = (
-19 "After the ghost's departure, Barnardo notes Horatio's pale appearance and asks if he's okay. "
-20 "Horatio concedes that he's shaken and confesses that, without witnessing the ghost himself, he wouldn't have believed it existed. "
-21 "He's also disturbed by the ghost's striking resemblance to the king. It even seems to be wearing the former king's armor. "
-22 "Horatio thinks the ghost's presence foretells that something is about to go wrong in Denmark. "
-23 "Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships."
-24 )
-25 prompts = [
-26 common_prefix, common_prefix + " Marcellus also notes that the king's"
-27 ]
-28
-29 # Create a sampling params.
-30 sampling_params = SamplingParams(temperature=0.001,
-31 top_p=0.001,
-32 max_tokens=5)
-33
-34 for output in llm.generate(prompts, sampling_params=sampling_params):
-35 print(
-36 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-37 )
-38
-39 kv_events = llm.get_kv_cache_events(10)
-40 print(kv_events)
-41
-42 # Got output like follows:
-43 # [{'event_id': 0, 'data': {'type': 'created', 'num_blocks_per_cache_level': [101230, 0]}},
-44 # {'event_id': 1, 'data': {'type': 'stored', 'parent_hash': None, 'blocks': [{'type': 'stored_block', 'block_hash': 4203099703668305365, 'tokens': [{'type': 'unique_token', 'token_id': 1, 'token_extra_id': 0}, ...
-45
+ 7
+ 8def main():
+ 9
+10 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+11 tensor_parallel_size=2,
+12 autotuner_enabled=False,
+13 kv_cache_dtype='auto',
+14 kv_cache_config=KvCacheConfig(enable_block_reuse=True,
+15 event_buffer_max_size=1024),
+16 backend="pytorch")
+17
+18 # Sample prompts having a common prefix.
+19 common_prefix = (
+20 "After the ghost's departure, Barnardo notes Horatio's pale appearance and asks if he's okay. "
+21 "Horatio concedes that he's shaken and confesses that, without witnessing the ghost himself, he wouldn't have believed it existed. "
+22 "He's also disturbed by the ghost's striking resemblance to the king. It even seems to be wearing the former king's armor. "
+23 "Horatio thinks the ghost's presence foretells that something is about to go wrong in Denmark. "
+24 "Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships."
+25 )
+26 prompts = [
+27 common_prefix, common_prefix + " Marcellus also notes that the king's"
+28 ]
+29
+30 # Create a sampling params.
+31 sampling_params = SamplingParams(temperature=0.001,
+32 top_p=0.001,
+33 max_tokens=5)
+34
+35 for output in llm.generate(prompts, sampling_params=sampling_params):
+36 print(
+37 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+38 )
+39
+40 kv_events = llm.get_kv_cache_events(10)
+41 print(kv_events)
+42
+43 # Got output like follows:
+44 # [{'event_id': 0, 'data': {'type': 'created', 'num_blocks_per_cache_level': [101230, 0]}},
+45 # {'event_id': 1, 'data': {'type': 'stored', 'parent_hash': None, 'blocks': [{'type': 'stored_block', 'block_hash': 4203099703668305365, 'tokens': [{'type': 'unique_token', 'token_id': 1, 'token_extra_id': 0}, ...
46
-47if __name__ == '__main__':
-48 main()
+47
+48if __name__ == '__main__':
+49 main()
diff --git a/latest/examples/llm_logits_processor.html b/latest/examples/llm_logits_processor.html
index f175afd0af..a4385e1da5 100644
--- a/latest/examples/llm_logits_processor.html
+++ b/latest/examples/llm_logits_processor.html
@@ -518,7 +518,7 @@
3
4import torch
5
- 6from tensorrt_llm import LLM
+ 6from tensorrt_llm._tensorrt_engine import LLM
7from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
8 LogitsProcessor, SamplingParams)
9
diff --git a/latest/examples/llm_lookahead_decoding.html b/latest/examples/llm_lookahead_decoding.html
index 04ca558c3d..a2dfaa2a32 100644
--- a/latest/examples/llm_lookahead_decoding.html
+++ b/latest/examples/llm_lookahead_decoding.html
@@ -514,8 +514,8 @@
Generate Text Using Lookahead Decoding
Source NVIDIA/TensorRT-LLM.
1### Generate Text Using Lookahead Decoding
- 2from tensorrt_llm import LLM, SamplingParams
- 3from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+ 2from tensorrt_llm._tensorrt_engine import LLM
+ 3from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
4 LookaheadDecodingConfig, SamplingParams)
5
6
diff --git a/latest/examples/llm_medusa_decoding.html b/latest/examples/llm_medusa_decoding.html
index 83c8ef1c8e..c894804d22 100644
--- a/latest/examples/llm_medusa_decoding.html
+++ b/latest/examples/llm_medusa_decoding.html
@@ -517,8 +517,8 @@
2import argparse
3from pathlib import Path
4
- 5from tensorrt_llm import LLM, SamplingParams
- 6from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+ 5from tensorrt_llm._tensorrt_engine import LLM
+ 6from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
7 MedusaDecodingConfig, SamplingParams)
8
9
diff --git a/latest/examples/llm_mgmn_llm_distributed.html b/latest/examples/llm_mgmn_llm_distributed.html
index f2ad3ab65a..0c71e63723 100644
--- a/latest/examples/llm_mgmn_llm_distributed.html
+++ b/latest/examples/llm_mgmn_llm_distributed.html
diff --git a/latest/examples/llm_mgmn_trtllm_bench.html b/latest/examples/llm_mgmn_trtllm_bench.html
index 0c8cf6c6ae..b8d6007a3e 100644
--- a/latest/examples/llm_mgmn_trtllm_bench.html
+++ b/latest/examples/llm_mgmn_trtllm_bench.html
diff --git a/latest/examples/llm_mgmn_trtllm_serve.html b/latest/examples/llm_mgmn_trtllm_serve.html
index 7eaa54f2c8..4bfdbc3b13 100644
--- a/latest/examples/llm_mgmn_trtllm_serve.html
+++ b/latest/examples/llm_mgmn_trtllm_serve.html
diff --git a/latest/examples/llm_multilora.html b/latest/examples/llm_multilora.html
index 852b1eb3dd..3f0df74831 100644
--- a/latest/examples/llm_multilora.html
+++ b/latest/examples/llm_multilora.html
@@ -516,62 +516,63 @@
1### Generate text with multiple LoRA adapters
2from huggingface_hub import snapshot_download
3
- 4from tensorrt_llm import LLM, BuildConfig
+ 4from tensorrt_llm._tensorrt_engine import LLM
5from tensorrt_llm.executor import LoRARequest
- 6from tensorrt_llm.lora_manager import LoraConfig
- 7
+ 6from tensorrt_llm.llmapi import BuildConfig
+ 7from tensorrt_llm.lora_manager import LoraConfig
8
- 9def main():
-10
-11 # Download the LoRA adapters from huggingface hub.
-12 lora_dir1 = snapshot_download(repo_id="snshrivas10/sft-tiny-chatbot")
-13 lora_dir2 = snapshot_download(
-14 repo_id="givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational")
-15 lora_dir3 = snapshot_download(repo_id="barissglc/tinyllama-tarot-v1")
-16
-17 # Currently, we need to pass at least one lora_dir to LLM constructor via build_config.lora_config.
-18 # This is necessary because it requires some configuration in the lora_dir to build the engine with LoRA support.
-19 build_config = BuildConfig()
-20 build_config.lora_config = LoraConfig(lora_dir=[lora_dir1])
-21 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-22 enable_lora=True,
-23 max_lora_rank=64,
-24 build_config=build_config)
-25
-26 # Sample prompts
-27 prompts = [
-28 "Hello, tell me a story: ",
+ 9
+10def main():
+11
+12 # Download the LoRA adapters from huggingface hub.
+13 lora_dir1 = snapshot_download(repo_id="snshrivas10/sft-tiny-chatbot")
+14 lora_dir2 = snapshot_download(
+15 repo_id="givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational")
+16 lora_dir3 = snapshot_download(repo_id="barissglc/tinyllama-tarot-v1")
+17
+18 # Currently, we need to pass at least one lora_dir to LLM constructor via build_config.lora_config.
+19 # This is necessary because it requires some configuration in the lora_dir to build the engine with LoRA support.
+20 build_config = BuildConfig()
+21 build_config.lora_config = LoraConfig(lora_dir=[lora_dir1])
+22 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+23 enable_lora=True,
+24 max_lora_rank=64,
+25 build_config=build_config)
+26
+27 # Sample prompts
+28 prompts = [
29 "Hello, tell me a story: ",
-30 "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
+30 "Hello, tell me a story: ",
31 "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
-32 "In this reading, the Justice card represents a situation where",
+32 "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
33 "In this reading, the Justice card represents a situation where",
-34 ]
-35
-36 # At runtime, multiple LoRA adapters can be specified via lora_request; None means no LoRA used.
-37 for output in llm.generate(prompts,
-38 lora_request=[
-39 None,
-40 LoRARequest("chatbot", 1, lora_dir1), None,
-41 LoRARequest("mental-health", 2, lora_dir2),
-42 None,
-43 LoRARequest("tarot", 3, lora_dir3)
-44 ]):
-45 prompt = output.prompt
-46 generated_text = output.outputs[0].text
-47 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-48
-49 # Got output like
-50 # Prompt: 'Hello, tell me a story: ', Generated text: '1. Start with a question: "What\'s your favorite color?" 2. Ask a question that leads to a story: "What\'s your'
-51 # Prompt: 'Hello, tell me a story: ', Generated text: '1. A person is walking down the street. 2. A person is sitting on a bench. 3. A person is reading a book.'
-52 # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (smiling) No, I'm just feeling a bit overwhelmed lately. I've been trying to"
-53 # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (sighs) Yeah, I've been struggling with some personal issues. I've been feeling like I'm"
-54 # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. The card suggests that you should take the time to consider all the options'
-55 # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. It is important to take the time to consider all the options and make'
-56
+34 "In this reading, the Justice card represents a situation where",
+35 ]
+36
+37 # At runtime, multiple LoRA adapters can be specified via lora_request; None means no LoRA used.
+38 for output in llm.generate(prompts,
+39 lora_request=[
+40 None,
+41 LoRARequest("chatbot", 1, lora_dir1), None,
+42 LoRARequest("mental-health", 2, lora_dir2),
+43 None,
+44 LoRARequest("tarot", 3, lora_dir3)
+45 ]):
+46 prompt = output.prompt
+47 generated_text = output.outputs[0].text
+48 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+49
+50 # Got output like
+51 # Prompt: 'Hello, tell me a story: ', Generated text: '1. Start with a question: "What\'s your favorite color?" 2. Ask a question that leads to a story: "What\'s your'
+52 # Prompt: 'Hello, tell me a story: ', Generated text: '1. A person is walking down the street. 2. A person is sitting on a bench. 3. A person is reading a book.'
+53 # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (smiling) No, I'm just feeling a bit overwhelmed lately. I've been trying to"
+54 # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (sighs) Yeah, I've been struggling with some personal issues. I've been feeling like I'm"
+55 # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. The card suggests that you should take the time to consider all the options'
+56 # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. It is important to take the time to consider all the options and make'
57
-58if __name__ == '__main__':
-59 main()
+58
+59if __name__ == '__main__':
+60 main()
diff --git a/latest/examples/llm_quantization.html b/latest/examples/llm_quantization.html
index 13ca60d8cd..b1d390169f 100644
--- a/latest/examples/llm_quantization.html
+++ b/latest/examples/llm_quantization.html
@@ -518,82 +518,83 @@
3
4import torch
5
- 6from tensorrt_llm import LLM, SamplingParams
- 7from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig
- 8
- 9major, minor = torch.cuda.get_device_capability()
-10enable_fp8 = major > 8 or (major == 8 and minor >= 9)
-11enable_nvfp4 = major >= 10
-12
-13quant_and_calib_configs = []
-14
-15if not enable_nvfp4:
-16 # Example 1: Specify int4 AWQ quantization to QuantConfig.
-17 # We can skip specifying CalibConfig or leave a None as the default value.
-18 quant_and_calib_configs.append(
-19 (QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ), None))
-20
-21if enable_fp8:
-22 # Example 2: Specify FP8 quantization to QuantConfig.
-23 # We can create a CalibConfig to specify the calibration dataset and other details.
-24 # Note that the calibration dataset could be either HF dataset name or a path to local HF dataset.
-25 quant_and_calib_configs.append(
-26 (QuantConfig(quant_algo=QuantAlgo.FP8,
-27 kv_cache_quant_algo=QuantAlgo.FP8),
-28 CalibConfig(calib_dataset='cnn_dailymail',
-29 calib_batches=256,
-30 calib_max_seq_length=256)))
-31else:
-32 logging.error(
-33 "FP8 quantization only works on post-ada GPUs. Skipped in the example.")
-34
-35if enable_nvfp4:
-36 # Example 3: Specify NVFP4 quantization to QuantConfig.
-37 quant_and_calib_configs.append(
-38 (QuantConfig(quant_algo=QuantAlgo.NVFP4,
-39 kv_cache_quant_algo=QuantAlgo.FP8),
-40 CalibConfig(calib_dataset='cnn_dailymail',
-41 calib_batches=256,
-42 calib_max_seq_length=256)))
-43else:
-44 logging.error(
-45 "NVFP4 quantization only works on Blackwell. Skipped in the example.")
-46
+ 6from tensorrt_llm import SamplingParams
+ 7from tensorrt_llm._tensorrt_engine import LLM
+ 8from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig
+ 9
+10major, minor = torch.cuda.get_device_capability()
+11enable_fp8 = major > 8 or (major == 8 and minor >= 9)
+12enable_nvfp4 = major >= 10
+13
+14quant_and_calib_configs = []
+15
+16if not enable_nvfp4:
+17 # Example 1: Specify int4 AWQ quantization to QuantConfig.
+18 # We can skip specifying CalibConfig or leave a None as the default value.
+19 quant_and_calib_configs.append(
+20 (QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ), None))
+21
+22if enable_fp8:
+23 # Example 2: Specify FP8 quantization to QuantConfig.
+24 # We can create a CalibConfig to specify the calibration dataset and other details.
+25 # Note that the calibration dataset could be either HF dataset name or a path to local HF dataset.
+26 quant_and_calib_configs.append(
+27 (QuantConfig(quant_algo=QuantAlgo.FP8,
+28 kv_cache_quant_algo=QuantAlgo.FP8),
+29 CalibConfig(calib_dataset='cnn_dailymail',
+30 calib_batches=256,
+31 calib_max_seq_length=256)))
+32else:
+33 logging.error(
+34 "FP8 quantization only works on post-ada GPUs. Skipped in the example.")
+35
+36if enable_nvfp4:
+37 # Example 3: Specify NVFP4 quantization to QuantConfig.
+38 quant_and_calib_configs.append(
+39 (QuantConfig(quant_algo=QuantAlgo.NVFP4,
+40 kv_cache_quant_algo=QuantAlgo.FP8),
+41 CalibConfig(calib_dataset='cnn_dailymail',
+42 calib_batches=256,
+43 calib_max_seq_length=256)))
+44else:
+45 logging.error(
+46 "NVFP4 quantization only works on Blackwell. Skipped in the example.")
47
-48def main():
-49
-50 for quant_config, calib_config in quant_and_calib_configs:
-51 # The built-in end-to-end quantization is triggered according to the passed quant_config.
-52 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-53 quant_config=quant_config,
-54 calib_config=calib_config)
-55
-56 # Sample prompts.
-57 prompts = [
-58 "Hello, my name is",
-59 "The president of the United States is",
-60 "The capital of France is",
-61 "The future of AI is",
-62 ]
-63
-64 # Create a sampling params.
-65 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-66
-67 for output in llm.generate(prompts, sampling_params):
-68 print(
-69 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-70 )
-71 llm.shutdown()
-72
-73 # Got output like
-74 # Prompt: 'Hello, my name is', Generated text: 'Jane Smith. I am a resident of the city. Can you tell me more about the public services provided in the area?'
-75 # Prompt: 'The president of the United States is', Generated text: 'considered the head of state, and the vice president of the United States is considered the head of state. President and Vice President of the United States (US)'
-76 # Prompt: 'The capital of France is', Generated text: 'located in Paris, France. The population of Paris, France, is estimated to be 2 million. France is home to many famous artists, including Picasso'
-77 # Prompt: 'The future of AI is', Generated text: 'an open and collaborative project. The project is an ongoing effort, and we invite participation from members of the community.\n\nOur community is'
-78
+48
+49def main():
+50
+51 for quant_config, calib_config in quant_and_calib_configs:
+52 # The built-in end-to-end quantization is triggered according to the passed quant_config.
+53 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+54 quant_config=quant_config,
+55 calib_config=calib_config)
+56
+57 # Sample prompts.
+58 prompts = [
+59 "Hello, my name is",
+60 "The president of the United States is",
+61 "The capital of France is",
+62 "The future of AI is",
+63 ]
+64
+65 # Create a sampling params.
+66 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+67
+68 for output in llm.generate(prompts, sampling_params):
+69 print(
+70 f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+71 )
+72 llm.shutdown()
+73
+74 # Got output like
+75 # Prompt: 'Hello, my name is', Generated text: 'Jane Smith. I am a resident of the city. Can you tell me more about the public services provided in the area?'
+76 # Prompt: 'The president of the United States is', Generated text: 'considered the head of state, and the vice president of the United States is considered the head of state. President and Vice President of the United States (US)'
+77 # Prompt: 'The capital of France is', Generated text: 'located in Paris, France. The population of Paris, France, is estimated to be 2 million. France is home to many famous artists, including Picasso'
+78 # Prompt: 'The future of AI is', Generated text: 'an open and collaborative project. The project is an ongoing effort, and we invite participation from members of the community.\n\nOur community is'
79
-80if __name__ == '__main__':
-81 main()
+80
+81if __name__ == '__main__':
+82 main()
diff --git a/latest/examples/openai_chat_client.html b/latest/examples/openai_chat_client.html
index 6f37ec8cb1..43ce931dfe 100644
--- a/latest/examples/openai_chat_client.html
+++ b/latest/examples/openai_chat_client.html
diff --git a/latest/examples/openai_chat_client_for_multimodal.html b/latest/examples/openai_chat_client_for_multimodal.html
index 33eecef3a6..a6bfd11067 100644
--- a/latest/examples/openai_chat_client_for_multimodal.html
+++ b/latest/examples/openai_chat_client_for_multimodal.html
diff --git a/latest/examples/openai_completion_client.html b/latest/examples/openai_completion_client.html
index 6fdefe3a75..585a45b393 100644
--- a/latest/examples/openai_completion_client.html
+++ b/latest/examples/openai_completion_client.html
diff --git a/latest/examples/trtllm_serve_examples.html b/latest/examples/trtllm_serve_examples.html
index a7c7cdd35a..6cff8b05e7 100644
--- a/latest/examples/trtllm_serve_examples.html
+++ b/latest/examples/trtllm_serve_examples.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -653,9 +653,9 @@
diff --git a/latest/genindex.html b/latest/genindex.html
index d278607b3a..0897712e66 100644
--- a/latest/genindex.html
+++ b/latest/genindex.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -750,8 +750,6 @@
(tensorrt_llm.llmapi.KvCacheRetentionConfig method)
(tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig method)
-
-
(tensorrt_llm.llmapi.LLM method)
(tensorrt_llm.llmapi.LookaheadDecodingConfig method)
@@ -943,7 +941,7 @@
cache_root (tensorrt_llm.llmapi.BuildCacheConfig attribute)
CacheTransceiverConfig (class in tensorrt_llm.llmapi)
@@ -1234,9 +1232,11 @@
(tensorrt_llm.llmapi.TrtLlmArgs attribute)
-
decoding_type (tensorrt_llm.llmapi.EagleDecodingConfig attribute)
+ decoding_type (tensorrt_llm.llmapi.DraftTargetDecodingConfig attribute)
fc_gate_plugin() (tensorrt_llm.layers.mlp.FusedGatedMLP method)
-
field_name (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
+ field_name (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
- - (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
+
- (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
fill_attention_const_params_for_long_rope() (tensorrt_llm.layers.attention.AttentionParams method)
@@ -1670,6 +1680,8 @@
- (tensorrt_llm.llmapi.CalibConfig class method)
+
+ - (tensorrt_llm.llmapi.DraftTargetDecodingConfig class method)
- (tensorrt_llm.llmapi.EagleDecodingConfig class method)
@@ -2203,7 +2215,7 @@
- length (tensorrt_llm.llmapi.CompletionOutput attribute)
- length_penalty (tensorrt_llm.llmapi.SamplingParams attribute)
@@ -2293,7 +2305,7 @@
- logprobs_diff (tensorrt_llm.llmapi.CompletionOutput attribute)
- long_rope (tensorrt_llm.functional.PositionEmbeddingType attribute)
@@ -2375,7 +2387,7 @@
- max_cache_storage_gb (tensorrt_llm.llmapi.BuildCacheConfig attribute)
- max_cpu_loras (tensorrt_llm.llmapi.TorchLlmArgs attribute)
@@ -2435,7 +2447,7 @@
- max_records (tensorrt_llm.llmapi.BuildCacheConfig attribute)
- max_seq_len (tensorrt_llm.llmapi.BuildConfig attribute)
@@ -2537,6 +2549,8 @@
- MropeParams (class in tensorrt_llm.layers.attention)
- - msg (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
+
- msg (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
- - (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
+
- (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
- MTPDecodingConfig (class in tensorrt_llm.llmapi)
@@ -2945,7 +2959,7 @@
- prompt (tensorrt_llm.llmapi.RequestOutput attribute)
- prompt_logprobs (tensorrt_llm.llmapi.CompletionOutput attribute)
@@ -2972,8 +2986,12 @@
- python_e2e (tensorrt_llm.runtime.MultimodalModelRunner property)
- - pytorch_eagle_weights_path (tensorrt_llm.llmapi.EagleDecodingConfig attribute)
+
- pytorch_weights_path (tensorrt_llm.llmapi.DraftTargetDecodingConfig attribute)
+
+
@@ -3205,8 +3223,6 @@
SamplingConfig (class in tensorrt_llm.runtime)
SamplingParams (class in tensorrt_llm.llmapi)
-
-
save() (tensorrt_llm.llmapi.LLM method)
save_checkpoint() (tensorrt_llm.models.LlavaNextVisionWrapper method)
@@ -3300,10 +3316,10 @@
size() (tensorrt_llm.functional.Tensor method)
-
-
|
+
|
+
|
-
- tensorrt_llm::runtime::CudaEvent (C++ class)
- tensorrt_llm::runtime::CudaEvent::CudaEvent (C++ function), [1]
@@ -8694,7 +8714,7 @@
- text_diff (tensorrt_llm.llmapi.CompletionOutput attribute)
- TimestepEmbedding (class in tensorrt_llm.layers.embedding)
@@ -8744,7 +8764,7 @@
- token_ids_diff (tensorrt_llm.llmapi.CompletionOutput attribute)
- token_range_retention_configs (tensorrt_llm.llmapi.KvCacheRetentionConfig property)
@@ -8800,14 +8820,6 @@
- topk() (in module tensorrt_llm.functional)
- torch_compile_config (tensorrt_llm.llmapi.TorchLlmArgs attribute)
-
- - torch_compile_enable_userbuffers (tensorrt_llm.llmapi.TorchCompileConfig attribute)
-
- - torch_compile_fullgraph (tensorrt_llm.llmapi.TorchCompileConfig attribute)
-
- - torch_compile_inductor_enabled (tensorrt_llm.llmapi.TorchCompileConfig attribute)
-
- - torch_compile_piecewise_cuda_graph (tensorrt_llm.llmapi.TorchCompileConfig attribute)
- TorchCompileConfig (class in tensorrt_llm.llmapi)
@@ -9007,6 +9019,8 @@
- use_meta_recipe (tensorrt_llm.llmapi.QuantConfig attribute)
- use_mrope (tensorrt_llm.llmapi.BuildConfig attribute)
+
+ - use_mtp_vanilla (tensorrt_llm.llmapi.MTPDecodingConfig attribute)
- use_prompt_tuning() (tensorrt_llm.models.EncoderModel method)
@@ -9033,6 +9047,8 @@
- validate_moe_load_balancer() (tensorrt_llm.llmapi.TorchLlmArgs method)
- validate_positive_values() (tensorrt_llm.llmapi.LookaheadDecodingConfig class method)
+
+ - validate_stream_interval() (tensorrt_llm.llmapi.TorchLlmArgs method)
- VERBATIM (tensorrt_llm.models.GemmaConfig attribute)
@@ -9097,11 +9113,11 @@
- W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)
- W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)
-
- - W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)
|
+ - W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)
+
- W8A8_SQ_PER_TENSOR_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)
- weight_loader() (tensorrt_llm.layers.attention.DeepseekV2Attention method)
@@ -9120,18 +9136,12 @@
- WhisperEncoder (class in tensorrt_llm.models)
- - workspace (tensorrt_llm.llmapi.LLM attribute)
+
- workspace (tensorrt_llm.llmapi.TrtLlmArgs attribute)
+
+ - wrapped_property (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
- - wrapped_property (tensorrt_llm.llmapi.TorchLlmArgs attribute), [1], [2], [3]
-
-
- - (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
+
- (tensorrt_llm.llmapi.TrtLlmArgs attribute), [1], [2], [3], [4], [5]
|
@@ -9260,9 +9270,9 @@
diff --git a/latest/index.html b/latest/index.html
index 02e80a45b2..979739ea7c 100644
--- a/latest/index.html
+++ b/latest/index.html
@@ -51,7 +51,7 @@
@@ -62,7 +62,7 @@
-
+
@@ -916,9 +916,9 @@
diff --git a/latest/installation/build-from-source-linux.html b/latest/installation/build-from-source-linux.html
index 611fde6857..5fbb038d05 100644
--- a/latest/installation/build-from-source-linux.html
+++ b/latest/installation/build-from-source-linux.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -513,7 +513,7 @@
Building from Source Code on Linux
-This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different GNU CXX11 ABI configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.
+This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different GNU CXX11 ABI configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which use the new CXX11 ABI.
Prerequisites
Use Docker to build and run TensorRT-LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found here.
@@ -856,9 +856,9 @@ pip install ./build/tensorrt_llm*.
diff --git a/latest/installation/grace-hopper.html b/latest/installation/grace-hopper.html
index 5603eba61f..adb630e5ad 100644
--- a/latest/installation/grace-hopper.html
+++ b/latest/installation/grace-hopper.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -511,7 +511,7 @@
Installing on Grace Hopper
Install TensorRT-LLM (tested on Ubuntu 24.04).
-pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
@@ -519,33 +519,34 @@ sudo
apt-get
-y
If using the PyTorch NGC Container image, the prerequisite steps for installing the CUDA-enabled PyTorch package and libopenmpi-dev are not required.
Sanity check the installation by running the following in Python (tested on Python 3.12):
- 1from tensorrt_llm import LLM, SamplingParams
- 2
+ 1from tensorrt_llm import SamplingParams
+ 2from tensorrt_llm._tensorrt_engine import LLM
3
- 4def main():
- 5
- 6 prompts = [
- 7 "Hello, my name is",
- 8 "The president of the United States is",
- 9 "The capital of France is",
-10 "The future of AI is",
-11 ]
-12 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-13
-14 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-15
-16 outputs = llm.generate(prompts, sampling_params)
-17
-18 # Print the outputs.
-19 for output in outputs:
-20 prompt = output.prompt
-21 generated_text = output.outputs[0].text
-22 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-23
+ 4
+ 5def main():
+ 6
+ 7 prompts = [
+ 8 "Hello, my name is",
+ 9 "The president of the United States is",
+10 "The capital of France is",
+11 "The future of AI is",
+12 ]
+13 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+14
+15 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+16
+17 outputs = llm.generate(prompts, sampling_params)
+18
+19 # Print the outputs.
+20 for output in outputs:
+21 prompt = output.prompt
+22 generated_text = output.outputs[0].text
+23 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
24
-25# The entry point of the program need to be protected for spawning processes.
-26if __name__ == '__main__':
-27 main()
+25
+26# The entry point of the program needs to be protected for spawning processes.
+27if __name__ == '__main__':
+28 main()
@@ -680,9 +681,9 @@ sudo apt-get -y
diff --git a/latest/installation/linux.html b/latest/installation/linux.html
index 42e019971d..3840a87b86 100644
--- a/latest/installation/linux.html
+++ b/latest/installation/linux.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -511,7 +511,7 @@
Installing on Linux
Install TensorRT-LLM (tested on Ubuntu 24.04).
-(Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+(Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
@@ -520,33 +520,34 @@ sudo
apt-get
-y
If using the PyTorch NGC Container image, the prerequisite steps for installing the NVIDIA Blackwell-enabled PyTorch package and libopenmpi-dev are not required.
Sanity check the installation by running the following in Python (tested on Python 3.12):
- 1from tensorrt_llm import LLM, SamplingParams
- 2
+ 1from tensorrt_llm import SamplingParams
+ 2from tensorrt_llm._tensorrt_engine import LLM
3
- 4def main():
- 5
- 6 prompts = [
- 7 "Hello, my name is",
- 8 "The president of the United States is",
- 9 "The capital of France is",
-10 "The future of AI is",
-11 ]
-12 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-13
-14 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-15
-16 outputs = llm.generate(prompts, sampling_params)
-17
-18 # Print the outputs.
-19 for output in outputs:
-20 prompt = output.prompt
-21 generated_text = output.outputs[0].text
-22 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-23
+ 4
+ 5def main():
+ 6
+ 7 prompts = [
+ 8 "Hello, my name is",
+ 9 "The president of the United States is",
+10 "The capital of France is",
+11 "The future of AI is",
+12 ]
+13 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+14
+15 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+16
+17 outputs = llm.generate(prompts, sampling_params)
+18
+19 # Print the outputs.
+20 for output in outputs:
+21 prompt = output.prompt
+22 generated_text = output.outputs[0].text
+23 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
24
-25# The entry point of the program need to be protected for spawning processes.
-26if __name__ == '__main__':
-27 main()
+25
+26# The entry point of the program needs to be protected for spawning processes.
+27if __name__ == '__main__':
+28 main()
@@ -716,9 +717,9 @@ Please install CUDA toolkit when you see the following message when running Mode
diff --git a/latest/key-features.html b/latest/key-features.html
index c0b1c4e105..7d95dae147 100644
--- a/latest/key-features.html
+++ b/latest/key-features.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -648,9 +648,9 @@
diff --git a/latest/llm-api/index.html b/latest/llm-api/index.html
index a0f6bbc0bb..cd9654fd3d 100644
--- a/latest/llm-api/index.html
+++ b/latest/llm-api/index.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -763,9 +763,9 @@ Refer to the
diff --git a/latest/llm-api/reference.html b/latest/llm-api/reference.html
index 72818510b1..9251b77c9a 100644
--- a/latest/llm-api/reference.html
+++ b/latest/llm-api/reference.html
@@ -51,7 +51,7 @@
@@ -63,7 +63,7 @@
-
+
@@ -531,8 +531,9 @@
)[source]
-
Bases: object
+Bases: _TorchLLM
The LLM class is the main class for running an LLM model.
+This class is an alias of TorchLLM.
- Parameters:
@@ -553,7 +554,7 @@
moe_expert_parallel_size (Optional[int]) – The expert parallel size for MoE models’s expert weights. Defaults to None.
enable_attention_dp (bool) – Enable attention data parallel. Defaults to False.
cp_config (Optional[dict]) – Context parallel config. Defaults to None.
-load_format (Literal['auto', 'dummy']) – The format to load the model. Defaults to auto.
+load_format (Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat]) – How to load the model weights. By default, the weight type is detected from the model checkpoint. Defaults to 0.
enable_lora (bool) – Enable LoRA. Defaults to False.
lora_config (Optional[tensorrt_llm.lora_manager.LoraConfig]) – LoRA configuration for the model. Defaults to None.
enable_prompt_adapter (bool) – Enable prompt adapter. Defaults to False.
@@ -568,7 +569,7 @@
peft_cache_config (Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]) – PEFT cache config. Defaults to None.
scheduler_config (tensorrt_llm.llmapi.llm_args.SchedulerConfig) – Scheduler config. Defaults to None.
cache_transceiver_config (Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]) – Cache transceiver config. Defaults to None.
-speculative_config (Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig, tensorrt_llm.llmapi.llm_args.EagleDecodingConfig, tensorrt_llm.llmapi.llm_args.MTPDecodingConfig, tensorrt_llm.llmapi.llm_args.NGramDecodingConfig, NoneType]) – Speculative decoding config. Defaults to None.
+speculative_config (Union[tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig, tensorrt_llm.llmapi.llm_args.EagleDecodingConfig, tensorrt_llm.llmapi.llm_args.MTPDecodingConfig, tensorrt_llm.llmapi.llm_args.NGramDecodingConfig, tensorrt_llm.llmapi.llm_args.DraftTargetDecodingConfig, NoneType]) – Speculative decoding config. Defaults to None.
batching_type (Optional[tensorrt_llm.llmapi.llm_args.BatchingType]) – Batching type. Defaults to None.
normalize_log_probs (bool) – Normalize log probabilities. Defaults to False.
max_batch_size (Optional[int]) – The maximum batch size. Defaults to None.
@@ -580,16 +581,29 @@
num_postprocess_workers (int) – The number of processes used for postprocessing the generated tokens, including detokenization. Defaults to 0.
postprocess_tokenizer_dir (Optional[str]) – The path to the tokenizer directory for postprocessing. Defaults to None.
reasoning_parser (Optional[str]) – The parser to separate reasoning content from output. Defaults to None.
+garbage_collection_gen0_threshold (int) – Threshold for Python garbage collection of generation 0 objects. Lower values trigger more frequent garbage collection. Defaults to 20000.
backend (Optional[str]) – The backend to use for this LLM instance. Defaults to None.
-enable_tqdm (bool) – Enable tqdm for progress bar. Defaults to False.
-workspace (Optional[str]) – The workspace for the model. Defaults to None.
-enable_build_cache (Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]) – Enable build cache. Defaults to False.
-extended_runtime_perf_knob_config (Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]) – Extended runtime perf knob config. Defaults to None.
-calib_config (Optional[tensorrt_llm.llmapi.llm_args.CalibConfig]) – Calibration config. Defaults to None.
-embedding_parallel_mode (str) – The embedding parallel mode. Defaults to SHARDING_ALONG_VOCAB.
-fast_build (bool) – Enable fast build. Defaults to False.
build_config (Optional[tensorrt_llm.builder.BuildConfig]) – Build config. Defaults to None.
-kwargs (Any) – Advanced arguments passed to LlmArgs.
+use_cuda_graph (bool) – If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests only (the reason is that it’s hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory. Defaults to False.
+cuda_graph_batch_sizes (Optional[List[int]]) – List of batch sizes to create CUDA graphs for. Defaults to None.
+cuda_graph_max_batch_size (int) – Maximum batch size for CUDA graphs. Defaults to 0.
+cuda_graph_padding_enabled (bool) – If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance. Defaults to False.
+disable_overlap_scheduler (bool) – Disable the overlap scheduler. Defaults to False.
+moe_max_num_tokens (Optional[int]) – If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used. Defaults to None.
+moe_load_balancer (Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]) – Configuration for MoE load balancing. Defaults to None.
+attn_backend (str) – Attention backend to use. Defaults to TRTLLM.
+moe_backend (str) – MoE backend to use. Defaults to CUTLASS.
+mixed_sampler (bool) – If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc. Defaults to False.
+enable_trtllm_sampler (bool) – If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies. Defaults to False.
+kv_cache_dtype (str) – Data type for KV cache. Defaults to auto.
+enable_iter_perf_stats (bool) – Enable iteration performance statistics. Defaults to False.
+enable_iter_req_stats (bool) – If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats. Defaults to False.
+print_iter_log (bool) – Print iteration logs. Defaults to False.
+torch_compile_config (Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]) – Torch compile config. Defaults to None.
+autotuner_enabled (bool) – Enable autotuner only when torch compile is enabled. Defaults to True.
+enable_layerwise_nvtx_marker (bool) – If true, enable layerwise nvtx marker. Defaults to False.
+enable_min_latency (bool) – If true, enable min-latency mode. Currently only used for Llama4. Defaults to False.
+stream_interval (int) – The iteration interval to create responses under the streaming mode. Set this to a larger value when the batch size is large, which helps reduce the streaming overhead. Defaults to 1.
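These keyword arguments are the PyTorch-backend knobs newly documented for the LLM constructor. A minimal sketch of setting a few of them, assuming the top-level tensorrt_llm.LLM now resolves to the TorchLLM described here; the values are illustrative only:

from tensorrt_llm import LLM, SamplingParams

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    use_cuda_graph=True,                  # capture decode-only batches as CUDA graphs
    cuda_graph_batch_sizes=[1, 2, 4, 8],  # graphs are created only for these batch sizes
    kv_cache_dtype="auto",                # let the backend choose the KV-cache dtype
    stream_interval=4,                    # emit streaming responses every 4 iterations
)

for output in llm.generate(["The capital of France is"],
                           SamplingParams(temperature=0.8, top_p=0.95)):
    print(output.outputs[0].text)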
@@ -604,17 +618,6 @@
-
--
-workspace
-The directory to store intermediate files.
-
-- Type:
-pathlib.Path
-
-
-
-
-
llm_id
@@ -626,26 +629,6 @@
-
--
-__init__(
-
-
-- model: str | Path,
-- tokenizer: str | Path | TokenizerBase | PreTrainedTokenizerBase | None = None,
-- tokenizer_mode: Literal['auto', 'slow'] = 'auto',
-- skip_tokenizer_init: bool = False,
-- trust_remote_code: bool = False,
-- tensor_parallel_size: int = 1,
-- dtype: str = 'auto',
-- revision: str | None = None,
-- tokenizer_revision: str | None = None,
-- **kwargs: Any,
-
-
-) → None[source]
-
-
-
generate(
@@ -660,7 +643,7 @@
- disaggregated_params: DisaggregatedParams | Sequence[DisaggregatedParams] | None = None,
-
) → RequestOutput | List[RequestOutput][source]
+
) → RequestOutput | List[RequestOutput]
Generate output for the given prompts in the synchronous mode.
Synchronous generation accepts either a single prompt or batched prompts.
@@ -701,7 +684,7 @@ A default one will be used if not provided.
- _postproc_params: PostprocParams | None = None,
-) → RequestOutput[source]
+) → RequestOutput
Generate output for the given prompt in the asynchronous mode.
Asynchronous generation accepts a single prompt only.
@@ -734,7 +717,7 @@ A default one will be used if not provided.
- timeout: float | None = 2,
-) → List[dict][source]
+) → List[dict]
Get iteration KV events from the runtime.
- KV events are used to track changes and operations within the KV Cache. Types of events:
@@ -771,7 +754,7 @@ A default one will be used if not provided.
- timeout: float | None = 2,
-) → IterationResult[source]
+) → IterationResult
Get iteration KV events from the runtime.
- KV events are used to track changes and operations within the KV Cache. Types of events:
@@ -802,7 +785,7 @@ A default one will be used if not provided.
-
-get_stats(timeout: float | None = 2) → List[dict][source]
+get_stats(timeout: float | None = 2) → List[dict]
Get iteration statistics from the runtime.
To collect statistics, call this function after prompts have been submitted with LLM().generate().
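A short sketch of that flow, assuming the enable_iter_perf_stats argument listed above is what enables statistics collection; the get_stats signature used is the one documented here:

from tensorrt_llm import LLM, SamplingParams

# Statistics collection is assumed to require enable_iter_perf_stats=True.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          enable_iter_perf_stats=True)

llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))

# Each returned entry is a dict describing one runtime iteration.
for iteration_stats in llm.get_stats(timeout=2):
    print(iteration_stats)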
@@ -830,7 +813,7 @@ To collect statistics, call this function after prompts have been submitted with
- timeout: float | None = 2,
-) → IterationResult[source]
+) → IterationResult
Get iteration statistics from the runtime.
To collect statistics, you can call this function in an async coroutine or via the /metrics endpoint (if you’re using trtllm-serve)
after prompts have been submitted.
@@ -852,20 +835,9 @@ after prompts have been submitted.
property llm_id: str
-
--
-save(engine_dir: str) → None[source]
-Save the built engine to the given path.
-
-- Parameters:
-engine_dir (str) – The path to save the engine.
-
-
-
-
-
-shutdown() → None[source]
+shutdown() → None
@@ -873,11 +845,6 @@ after prompts have been submitted.
property tokenizer: TokenizerBase | None
-
--
-property workspace: Path
-
-
@@ -1018,8 +985,8 @@ after prompts have been submitted.
--
-property length: int
+-
+property length: int
@@ -1028,8 +995,8 @@ after prompts have been submitted.
--
-property logprobs_diff: List[float]
+-
+property logprobs_diff: List[float]
@@ -1048,8 +1015,8 @@ after prompts have been submitted.
--
-property text_diff: str
+-
+property text_diff: str
@@ -1058,8 +1025,8 @@ after prompts have been submitted.
--
-property token_ids_diff: List[int]
+-
+property token_ids_diff: List[int]
@@ -1141,8 +1108,8 @@ after prompts have been submitted.
--
-property prompt: str | None
+-
+property prompt: str | None
@@ -2003,7 +1970,7 @@ validated to form a valid model.
-
-validator validate_positive_values » max_ngram_size, max_window_size, max_verification_set_size[source]
+validator validate_positive_values » max_window_size, max_ngram_size, max_verification_set_size[source]
@@ -2065,7 +2032,7 @@ validated to form a valid model.
- dynamic_tree_max_topK: int | None = None,
- num_eagle_layers: int | None = None,
- max_non_leaves_per_layer: int | None = None,
-- pytorch_eagle_weights_path: str | None = None,
+- pytorch_weights_path: str | None = None,
- eagle3_one_model: bool | None = True,
@@ -2123,8 +2090,8 @@ validated to form a valid model.
--
-field pytorch_eagle_weights_path: str | None = None
+-
+field pytorch_weights_path: str | None = None
@@ -2146,6 +2113,7 @@ validated to form a valid model.
- use_relaxed_acceptance_for_thinking: bool | None = False,
- relaxed_topk: int | None = 1,
- relaxed_delta: float | None = 0.0,
+- use_mtp_vanilla: bool | None = False,
)[source]
@@ -2181,6 +2149,11 @@ validated to form a valid model.
field relaxed_topk: int | None = 1
+
+-
+field use_mtp_vanilla: bool | None = False
+
+
-
field use_relaxed_acceptance_for_thinking: bool | None = False
@@ -2986,19 +2959,19 @@ changed, you should remove the caches manually.
)[source]
+
+-
+property cache_root: Path
+
+
-
-property cache_root: Path
+property max_cache_storage_gb: float
-
-property max_cache_storage_gb: float
-
-
-
--
-property max_records: int
+property max_records: int
@@ -3321,51 +3294,87 @@ Whether to use a common pool for all requests, or the pool is private for each r
- *,
-- torch_compile_fullgraph: bool = True,
-- torch_compile_inductor_enabled: bool = False,
-- torch_compile_piecewise_cuda_graph: bool = False,
-- torch_compile_enable_userbuffers: bool = True,
+- enable_fullgraph: bool = True,
+- enable_inductor: bool = False,
+- enable_piecewise_cuda_graph: bool = False,
+- enable_userbuffers: bool = True,
)[source]
Bases: BaseModel
Configuration for torch.compile.
+
+-
+field enable_fullgraph: bool = True
+Enable full graph compilation in torch.compile.
+
+
+
+-
+field enable_inductor: bool = False
+Enable inductor backend in torch.compile.
+
+
+
+-
+field enable_piecewise_cuda_graph: bool = False
+Enable piecewise CUDA graph in torch.compile.
+
+
+
+-
+field enable_userbuffers: bool = True
+When torch compile is enabled, userbuffers is enabled by default.
+
+
-
model_config: ClassVar[ConfigDict] = {}
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-
--
-field torch_compile_enable_userbuffers: bool = True
-When torch compile is enabled, userbuffers is enabled by default.
+
+
+
+-
+class tensorrt_llm.llmapi.DraftTargetDecodingConfig(
+
+
+- *,
+- max_draft_len: int | None = None,
+- speculative_model: str | Path | None = None,
+- pytorch_weights_path: str | None = None,
+
+
+)[source]
+Bases: DecodingBaseConfig
+
+-
+decoding_type: ClassVar[str] = 'DraftTarget'
+
+
+
+-
+classmethod from_dict(data: dict)[source]
+
+
+
+-
+model_config: ClassVar[ConfigDict] = {}
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
--
-field torch_compile_fullgraph: bool = True
-Enable full graph compilation in torch.compile.
-
-
-
--
-field torch_compile_inductor_enabled: bool = False
-Enable inductor backend in torch.compile.
-
-
-
--
-field torch_compile_piecewise_cuda_graph: bool = False
-Enable piecewise CUDA graph in torch.compile.
-
+-
+field pytorch_weights_path: str | None = None
+
-
tensorrt_llm.llmapi.LlmArgs
-alias of TrtLlmArgs
+alias of TorchLlmArgs
@@ -3410,7 +3419,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
- peft_cache_config: ~tensorrt_llm.llmapi.llm_args.PeftCacheConfig | None = None,
- scheduler_config: ~tensorrt_llm.llmapi.llm_args.SchedulerConfig = <factory>,
- cache_transceiver_config: ~tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig | None = None,
-- speculative_config: ~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig | ~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig | ~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig | None = None,
+- speculative_config: ~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig | ~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig | ~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig | ~tensorrt_llm.llmapi.llm_args.DraftTargetDecodingConfig | None = None,
- batching_type: ~tensorrt_llm.llmapi.llm_args.BatchingType | None = None,
- normalize_log_probs: bool = False,
- max_batch_size: int | None = None,
@@ -3422,6 +3431,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
- num_postprocess_workers: int = 0,
- postprocess_tokenizer_dir: str | None = None,
- reasoning_parser: str | None = None,
+- garbage_collection_gen0_threshold: int = 20000,
- decoding_config: object | None = None,
- _mpi_session: object | None = None,
- backend: str | None = None,
@@ -3445,6 +3455,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
- autotuner_enabled: bool = True,
- enable_layerwise_nvtx_marker: bool = False,
- enable_min_latency: bool = False,
+- stream_interval: int = 1,
)[source]
@@ -3468,6 +3479,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3492,6 +3504,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3516,6 +3529,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3545,6 +3559,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3570,6 +3585,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3594,6 +3610,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3642,6 +3659,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3666,6 +3684,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3690,6 +3709,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3714,6 +3734,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3738,6 +3759,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3762,6 +3784,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3801,6 +3824,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3826,6 +3850,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -3836,20 +3861,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_cpu_loras: int
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -3860,20 +3885,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_lora_rank: int | None
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -3884,20 +3909,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_loras: int
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -3922,6 +3947,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+validate_stream_interval
@@ -3967,6 +3993,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+validate_stream_interval
@@ -3991,6 +4018,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+validate_stream_interval
@@ -4015,6 +4043,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -4039,6 +4068,32 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
+
+
+
+
+
+
+-
+field stream_interval: int = 1
+The iteration interval to create responses under the streaming mode. Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.
+
+- Validated by:
+
+init_build_config
+set_default_max_input_len
+set_runtime_knobs_from_build_config
+validate_and_init_tokenizer
+validate_build_config_remaining
+validate_build_config_with_runtime_params
+validate_cuda_graph_config
+validate_lora_config_consistency
+validate_model_format_misc
+validate_moe_load_balancer
+validate_parallel_config
+validate_speculative_config
+validate_stream_interval
@@ -4063,6 +4118,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+validate_stream_interval
@@ -4087,6 +4143,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
validate_moe_load_balancer
validate_parallel_config
validate_speculative_config
+
validate_stream_interval
@@ -4113,6 +4170,11 @@ Whether to use a common pool for all requests, or the pool is private for each r
validator validate_moe_load_balancer » all fields[source]
+
+-
+validator validate_stream_interval » all fields[source]
+
+
@@ -4158,7 +4220,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
- peft_cache_config: ~tensorrt_llm.llmapi.llm_args.PeftCacheConfig | None = None,
- scheduler_config: ~tensorrt_llm.llmapi.llm_args.SchedulerConfig = <factory>,
- cache_transceiver_config: ~tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig | None = None,
-- speculative_config: ~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig | ~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig | ~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig | None = None,
+- speculative_config: ~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig | ~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig | ~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig | ~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig | ~tensorrt_llm.llmapi.llm_args.DraftTargetDecodingConfig | None = None,
- batching_type: ~tensorrt_llm.llmapi.llm_args.BatchingType | None = None,
- normalize_log_probs: bool = False,
- max_batch_size: int | None = None,
@@ -4170,6 +4232,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
- num_postprocess_workers: int = 0,
- postprocess_tokenizer_dir: str | None = None,
- reasoning_parser: str | None = None,
+- garbage_collection_gen0_threshold: int = 20000,
- decoding_config: object | None = None,
- _mpi_session: object | None = None,
- backend: str | None = None,
@@ -4221,20 +4284,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
auto_parallel_world_size: int | None
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -4296,20 +4359,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
decoding_config: object | None
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -4450,20 +4513,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_cpu_loras: int
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -4474,20 +4537,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_lora_rank: int | None
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -4498,20 +4561,20 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_loras: int
Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.
--
-msg
+-
+msg
The deprecation message to be emitted.
--
-wrapped_property
+-
+wrapped_property
The property instance if the deprecated field is a computed field, or None.
--
-field_name
+-
+field_name
The name of the field being deprecated.
@@ -4633,9 +4696,7 @@ Whether to use a common pool for all requests, or the pool is private for each r
@@ -4945,9 +5005,9 @@ Whether to use a common pool for all requests, or the pool is private for each r
max_records
max_cache_storage_gb
__init__()
-cache_root
-max_cache_storage_gb
-max_records
+cache_root
+max_cache_storage_gb
+max_records
RequestError
@@ -5002,11 +5062,18 @@ Whether to use a common pool for all requests, or the pool is private for each r
TorchCompileConfig
+
+DraftTargetDecodingConfig
LlmArgs
@@ -5036,21 +5103,21 @@ Whether to use a common pool for all requests, or the pool is private for each r
kv_cache_dtype
load_format
max_cpu_loras
max_lora_rank
max_loras
mixed_sampler
@@ -5060,11 +5127,13 @@ Whether to use a common pool for all requests, or the pool is private for each r
moe_load_balancer
moe_max_num_tokens
print_iter_log
+stream_interval
torch_compile_config
use_cuda_graph
validate_cuda_graph_config
validate_cuda_graph_max_batch_size
validate_moe_load_balancer
+validate_stream_interval
TrtLlmArgs
@@ -5076,17 +5145,17 @@ Whether to use a common pool for all requests, or the pool is private for each r
auto_parallel_config
auto_parallel_world_size
build_config
calib_config
decoding_config
embedding_parallel_mode
@@ -5096,21 +5165,21 @@ Whether to use a common pool for all requests, or the pool is private for each r
fast_build
init_calib_config
max_cpu_loras
max_lora_rank
max_loras
model_config
@@ -5215,9 +5284,9 @@ Whether to use a common pool for all requests, or the pool is private for each r
diff --git a/latest/objects.inv b/latest/objects.inv
index e73d4dd4f389854300eee1fb8e8a0ed0a27c4e6f..846097df7a7b8ffd03fb11e5e251034bcc12920f 100644
GIT binary patch
delta 136928