From 1b7ccbecd70e3a69113128a9d0abcbd554b91d2f Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 8 Jul 2025 02:03:18 +0000 Subject: [PATCH] Update latest GitHub pages to v1.0.0rc2 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 6590 +++--- latest/_cpp_gen/runtime.html | 18347 ++++++++-------- .../attention.py | 24 +- .../llm_args.py | 251 +- latest/_modules/index.html | 71 +- latest/_modules/tensorrt_llm/builder.html | 71 +- .../tensorrt_llm/disaggregated_params.html | 71 +- .../tensorrt_llm/executor/result.html | 74 +- .../_modules/tensorrt_llm/executor/utils.html | 71 +- latest/_modules/tensorrt_llm/functional.html | 71 +- .../tensorrt_llm/layers/activation.html | 71 +- .../tensorrt_llm/layers/attention.html | 71 +- latest/_modules/tensorrt_llm/layers/cast.html | 71 +- latest/_modules/tensorrt_llm/layers/conv.html | 71 +- .../tensorrt_llm/layers/embedding.html | 71 +- .../_modules/tensorrt_llm/layers/linear.html | 71 +- latest/_modules/tensorrt_llm/layers/mlp.html | 71 +- .../tensorrt_llm/layers/normalization.html | 71 +- .../_modules/tensorrt_llm/layers/pooling.html | 71 +- .../tensorrt_llm/llmapi/build_cache.html | 71 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 76 +- .../tensorrt_llm/llmapi/llm_args.html | 345 +- .../tensorrt_llm/llmapi/mpi_session.html | 71 +- .../tensorrt_llm/models/baichuan/model.html | 71 +- .../tensorrt_llm/models/bert/model.html | 71 +- .../tensorrt_llm/models/bloom/model.html | 71 +- .../tensorrt_llm/models/chatglm/config.html | 71 +- .../tensorrt_llm/models/chatglm/model.html | 71 +- .../tensorrt_llm/models/clip/model.html | 71 +- .../tensorrt_llm/models/cogvlm/config.html | 71 +- .../tensorrt_llm/models/cogvlm/model.html | 71 +- .../tensorrt_llm/models/commandr/model.html | 71 +- .../tensorrt_llm/models/dbrx/config.html | 71 +- .../tensorrt_llm/models/dbrx/model.html | 71 +- .../models/deepseek_v1/model.html | 71 +- .../models/deepseek_v2/model.html | 71 +- .../tensorrt_llm/models/dit/model.html | 71 +- .../tensorrt_llm/models/eagle/model.html | 71 +- .../tensorrt_llm/models/enc_dec/model.html | 71 +- .../tensorrt_llm/models/falcon/config.html | 71 +- .../tensorrt_llm/models/falcon/model.html | 71 +- .../tensorrt_llm/models/gemma/config.html | 71 +- .../tensorrt_llm/models/gemma/model.html | 71 +- .../tensorrt_llm/models/gpt/config.html | 71 +- .../tensorrt_llm/models/gpt/model.html | 71 +- .../tensorrt_llm/models/gptj/config.html | 71 +- .../tensorrt_llm/models/gptj/model.html | 71 +- .../tensorrt_llm/models/gptneox/model.html | 71 +- .../tensorrt_llm/models/llama/config.html | 71 +- .../tensorrt_llm/models/llama/model.html | 71 +- .../tensorrt_llm/models/mamba/model.html | 71 +- .../tensorrt_llm/models/medusa/config.html | 71 +- .../tensorrt_llm/models/medusa/model.html | 71 +- .../tensorrt_llm/models/mllama/model.html | 71 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 71 +- .../tensorrt_llm/models/modeling_utils.html | 73 +- .../tensorrt_llm/models/mpt/model.html | 71 +- .../models/multimodal_encoders/config.html | 71 +- .../models/multimodal_encoders/model.html | 71 +- .../tensorrt_llm/models/opt/model.html | 71 +- .../tensorrt_llm/models/phi/model.html | 71 +- .../tensorrt_llm/models/phi3/model.html | 71 +- .../models/recurrentgemma/model.html | 71 +- .../tensorrt_llm/models/redrafter/model.html | 71 +- .../_modules/tensorrt_llm/plugin/plugin.html | 71 +- .../tensorrt_llm/quantization/mode.html | 71 +- .../quantization/quantize_by_modelopt.html | 71 +- .../runtime/enc_dec_model_runner.html 
| 71 +- .../tensorrt_llm/runtime/generation.html | 71 +- .../runtime/kv_cache_manager.html | 71 +- .../tensorrt_llm/runtime/model_runner.html | 71 +- .../runtime/model_runner_cpp.html | 71 +- .../runtime/multimodal_model_runner.html | 93 +- .../tensorrt_llm/runtime/session.html | 71 +- .../tensorrt_llm/sampling_params.html | 71 +- latest/_sources/_cpp_gen/executor.rst.txt | 42 +- latest/_sources/_cpp_gen/runtime.rst.txt | 360 +- ...tice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt | 90 +- ...MTP_Implementation_and_Optimization.md.txt | 4 +- ..._Expert_Parallelism_in_TensorRT-LLM.md.txt | 4 +- .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- latest/_sources/examples/index.rst.txt | 19 +- .../examples/llm_api_examples.rst.txt | 39 +- .../examples/llm_auto_parallel.rst.txt | 8 - .../examples/llm_eagle2_decoding.rst.txt | 8 - .../examples/llm_eagle_decoding.rst.txt | 8 - .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 4 +- .../llm_inference_async_streaming.rst.txt | 4 +- .../examples/llm_inference_customize.rst.txt | 8 - .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_inference_kv_events.rst.txt | 8 - .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_lookahead_decoding.rst.txt | 8 - .../examples/llm_medusa_decoding.rst.txt | 8 - .../examples/llm_mgmn_llm_distributed.rst.txt | 6 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 6 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 6 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- .../examples/llm_quantization.rst.txt | 8 - .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 6 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 10 + .../examples/trtllm_serve_examples.rst.txt | 6 +- latest/_sources/index.rst.txt | 3 +- .../build-from-source-linux.md.txt | 23 +- .../_sources/installation/containers.md.txt | 10 + .../_sources/installation/grace-hopper.md.txt | 20 - latest/_sources/installation/linux.md.txt | 47 +- latest/_sources/llm-api/index.md.txt | 74 +- latest/_sources/llm-api/reference.rst.txt | 6 + .../_sources/performance/perf-overview.md.txt | 36 +- latest/_sources/quick-start-guide.md.txt | 31 +- .../_sources/reference/dev-containers.md.txt | 100 + latest/_sources/torch.md.txt | 45 +- .../feature_combination_matrix.md.txt | 18 + .../torch/features/overlap_scheduler.md.txt | 24 + .../torch/features/quantization.md.txt | 18 + .../_sources/torch/features/sampling.md.txt | 20 + latest/_static/togglebutton.css | 160 + latest/_static/togglebutton.js | 187 + latest/advanced/disaggregated-service.html | 71 +- latest/advanced/executor.html | 71 +- latest/advanced/expert-parallelism.html | 71 +- latest/advanced/gpt-attention.html | 71 +- latest/advanced/gpt-runtime.html | 71 +- latest/advanced/graph-rewriting.html | 71 +- latest/advanced/kv-cache-management.html | 71 +- latest/advanced/kv-cache-reuse.html | 71 +- latest/advanced/lora.html | 71 +- .../advanced/lowprecision-pcie-allreduce.html | 71 +- .../open-sourced-cutlass-kernels.html | 71 +- latest/advanced/speculative-decoding.html | 71 +- latest/advanced/weight-streaming.html | 71 +- latest/architecture/add-model.html | 71 +- 
latest/architecture/checkpoint.html | 71 +- latest/architecture/core-concepts.html | 71 +- latest/architecture/model-weights-loader.html | 71 +- latest/architecture/overview.html | 71 +- latest/architecture/workflow.html | 71 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 165 +- latest/blogs/Falcon180B-H200.html | 71 +- latest/blogs/H100vsA100.html | 77 +- latest/blogs/H200launch.html | 71 +- latest/blogs/XQA-kernel.html | 71 +- latest/blogs/quantization-in-TRT-LLM.html | 71 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 71 +- ...1_MTP_Implementation_and_Optimization.html | 75 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 71 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 75 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 71 +- latest/commands/trtllm-build.html | 71 +- latest/commands/trtllm-serve.html | 75 +- .../build-image-to-dockerhub.html | 71 +- latest/dev-on-cloud/dev-on-runpod.html | 71 +- latest/examples/curl_chat_client.html | 71 +- .../curl_chat_client_for_multimodal.html | 71 +- latest/examples/curl_completion_client.html | 71 +- latest/examples/customization.html | 77 +- .../deepseek_r1_reasoning_parser.html | 71 +- latest/examples/genai_perf_client.html | 71 +- .../genai_perf_client_for_multimodal.html | 71 +- latest/examples/index.html | 98 +- latest/examples/llm_api_examples.html | 145 +- latest/examples/llm_eagle2_decoding.html | 718 - latest/examples/llm_eagle_decoding.html | 723 - latest/examples/llm_guided_decoding.html | 174 +- latest/examples/llm_inference.html | 154 +- latest/examples/llm_inference_async.html | 175 +- .../llm_inference_async_streaming.html | 213 +- latest/examples/llm_inference_customize.html | 719 - .../examples/llm_inference_distributed.html | 173 +- latest/examples/llm_inference_kv_events.html | 711 - latest/examples/llm_logits_processor.html | 315 +- latest/examples/llm_medusa_decoding.html | 756 - latest/examples/llm_mgmn_llm_distributed.html | 176 +- latest/examples/llm_mgmn_trtllm_bench.html | 254 +- latest/examples/llm_mgmn_trtllm_serve.html | 180 +- latest/examples/llm_multilora.html | 194 +- latest/examples/llm_quantization.html | 744 - latest/examples/openai_chat_client.html | 116 +- .../openai_chat_client_for_multimodal.html | 306 +- latest/examples/openai_completion_client.html | 110 +- ...=> openai_completion_client_for_lora.html} | 166 +- latest/examples/trtllm_serve_examples.html | 81 +- latest/genindex.html | 163 +- latest/index.html | 85 +- .../installation/build-from-source-linux.html | 112 +- .../{grace-hopper.html => containers.html} | 149 +- latest/installation/linux.html | 118 +- latest/key-features.html | 71 +- latest/llm-api/index.html | 171 +- latest/llm-api/reference.html | 371 +- latest/objects.inv | Bin 147331 -> 147647 bytes latest/overview.html | 71 +- latest/performance/perf-analysis.html | 71 +- latest/performance/perf-benchmarking.html | 71 +- latest/performance/perf-overview.html | 107 +- .../benchmarking-default-performance.html | 71 +- .../deciding-model-sharding-strategy.html | 71 +- .../fp8-quantization.html | 71 +- .../performance-tuning-guide/index.html | 71 +- ...ing-max-batch-size-and-max-num-tokens.html | 71 +- .../useful-build-time-flags.html | 71 +- .../useful-runtime-flags.html | 71 +- latest/py-modindex.html | 71 +- .../python-api/tensorrt_llm.functional.html | 71 +- latest/python-api/tensorrt_llm.layers.html | 77 +- latest/python-api/tensorrt_llm.models.html | 71 +- latest/python-api/tensorrt_llm.plugin.html | 71 +- .../python-api/tensorrt_llm.quantization.html | 71 +- 
latest/python-api/tensorrt_llm.runtime.html | 71 +- latest/quick-start-guide.html | 110 +- latest/reference/ci-overview.html | 77 +- .../dev-containers.html} | 286 +- latest/reference/memory.html | 71 +- latest/reference/precision.html | 71 +- latest/reference/support-matrix.html | 71 +- latest/reference/troubleshooting.html | 71 +- latest/release-notes.html | 77 +- latest/scripts/disaggregated/README.html | 71 +- latest/search.html | 71 +- latest/searchindex.js | 2 +- latest/torch.html | 117 +- latest/torch/adding_new_model.html | 71 +- latest/torch/arch_overview.html | 71 +- latest/torch/attention.html | 71 +- .../features/feature_combination_matrix.html | 866 + latest/torch/features/overlap_scheduler.html | 666 + latest/torch/features/quantization.html | 632 + latest/torch/features/sampling.html | 634 + latest/torch/kv_cache_manager.html | 71 +- latest/torch/scheduler.html | 71 +- 241 files changed, 22753 insertions(+), 26564 deletions(-) delete mode 100644 latest/_sources/examples/llm_auto_parallel.rst.txt delete mode 100644 latest/_sources/examples/llm_eagle2_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_eagle_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_inference_customize.rst.txt delete mode 100644 latest/_sources/examples/llm_inference_kv_events.rst.txt delete mode 100644 latest/_sources/examples/llm_lookahead_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_medusa_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_quantization.rst.txt create mode 100644 latest/_sources/examples/openai_completion_client_for_lora.rst.txt create mode 100644 latest/_sources/installation/containers.md.txt delete mode 100644 latest/_sources/installation/grace-hopper.md.txt create mode 100644 latest/_sources/reference/dev-containers.md.txt create mode 100644 latest/_sources/torch/features/feature_combination_matrix.md.txt create mode 100644 latest/_sources/torch/features/overlap_scheduler.md.txt create mode 100644 latest/_sources/torch/features/quantization.md.txt create mode 100644 latest/_sources/torch/features/sampling.md.txt create mode 100644 latest/_static/togglebutton.css create mode 100644 latest/_static/togglebutton.js delete mode 100644 latest/examples/llm_eagle2_decoding.html delete mode 100644 latest/examples/llm_eagle_decoding.html delete mode 100644 latest/examples/llm_inference_customize.html delete mode 100644 latest/examples/llm_inference_kv_events.html delete mode 100644 latest/examples/llm_medusa_decoding.html delete mode 100644 latest/examples/llm_quantization.html rename latest/examples/{llm_auto_parallel.html => openai_completion_client_for_lora.html} (70%) rename latest/installation/{grace-hopper.html => containers.html} (71%) rename latest/{examples/llm_lookahead_decoding.html => reference/dev-containers.html} (59%) create mode 100644 latest/torch/features/feature_combination_matrix.html create mode 100644 latest/torch/features/overlap_scheduler.html create mode 100644 latest/torch/features/quantization.html create mode 100644 latest/torch/features/sampling.html diff --git a/latest/.buildinfo b/latest/.buildinfo index fff48eff5a..7eaa80657f 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
-config: 812228e223c943ca4d4a375a1c33a00f
+config: cb3cbe8a473ef8fd1cf27e6890eb63f4
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html
index aeaf7f62bd..5dbf38d7a5 100644
--- a/latest/_cpp_gen/executor.html
+++ b/latest/_cpp_gen/executor.html


@@ -511,72 +494,70 @@
-

Generate Text in Streaming#

+

Generate text in streaming#

Source NVIDIA/TensorRT-LLM.

-
 1### Generate Text in Streaming
- 2import asyncio
- 3
- 4from tensorrt_llm import SamplingParams
- 5from tensorrt_llm._tensorrt_engine import LLM
- 6
+
 1import asyncio
+ 2
+ 3from tensorrt_llm import LLM, SamplingParams
+ 4
+ 5
+ 6def main():
  7
- 8def main():
- 9
-10    # model could accept HF model name or a path to local HF model.
-11    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-12
-13    # Sample prompts.
-14    prompts = [
-15        "Hello, my name is",
-16        "The president of the United States is",
-17        "The capital of France is",
-18        "The future of AI is",
-19    ]
-20
-21    # Create a sampling params.
-22    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-23
-24    # Async based on Python coroutines
-25    async def task(id: int, prompt: str):
-26
-27        # streaming=True is used to enable streaming generation.
-28        async for output in llm.generate_async(prompt,
-29                                               sampling_params,
-30                                               streaming=True):
-31            print(f"Generation for prompt-{id}: {output.outputs[0].text!r}")
-32
-33    async def main():
-34        tasks = [task(id, prompt) for id, prompt in enumerate(prompts)]
-35        await asyncio.gather(*tasks)
+ 8    # model could accept HF model name or a path to local HF model.
+ 9    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+10
+11    # Sample prompts.
+12    prompts = [
+13        "Hello, my name is",
+14        "The president of the United States is",
+15        "The capital of France is",
+16        "The future of AI is",
+17    ]
+18
+19    # Create a sampling params.
+20    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+21
+22    # Async based on Python coroutines
+23    async def task(id: int, prompt: str):
+24
+25        # streaming=True is used to enable streaming generation.
+26        async for output in llm.generate_async(prompt,
+27                                               sampling_params,
+28                                               streaming=True):
+29            print(f"Generation for prompt-{id}: {output.outputs[0].text!r}")
+30
+31    async def main():
+32        tasks = [task(id, prompt) for id, prompt in enumerate(prompts)]
+33        await asyncio.gather(*tasks)
+34
+35    asyncio.run(main())
 36
-37    asyncio.run(main())
-38
-39    # Got output like follows:
-40    # Generation for prompt-0: '\n'
-41    # Generation for prompt-3: 'an'
-42    # Generation for prompt-2: 'Paris'
-43    # Generation for prompt-1: 'likely'
-44    # Generation for prompt-0: '\n\n'
-45    # Generation for prompt-3: 'an exc'
-46    # Generation for prompt-2: 'Paris.'
-47    # Generation for prompt-1: 'likely to'
-48    # Generation for prompt-0: '\n\nJ'
-49    # Generation for prompt-3: 'an exciting'
-50    # Generation for prompt-2: 'Paris.'
-51    # Generation for prompt-1: 'likely to nomin'
-52    # Generation for prompt-0: '\n\nJane'
-53    # Generation for prompt-3: 'an exciting time'
-54    # Generation for prompt-1: 'likely to nominate'
-55    # Generation for prompt-0: '\n\nJane Smith'
-56    # Generation for prompt-3: 'an exciting time for'
-57    # Generation for prompt-1: 'likely to nominate a'
-58    # Generation for prompt-0: '\n\nJane Smith.'
-59    # Generation for prompt-3: 'an exciting time for us'
-60    # Generation for prompt-1: 'likely to nominate a new'
-61
-62
-63if __name__ == '__main__':
-64    main()
+37    # Got output like follows:
+38    # Generation for prompt-0: '\n'
+39    # Generation for prompt-3: 'an'
+40    # Generation for prompt-2: 'Paris'
+41    # Generation for prompt-1: 'likely'
+42    # Generation for prompt-0: '\n\n'
+43    # Generation for prompt-3: 'an exc'
+44    # Generation for prompt-2: 'Paris.'
+45    # Generation for prompt-1: 'likely to'
+46    # Generation for prompt-0: '\n\nJ'
+47    # Generation for prompt-3: 'an exciting'
+48    # Generation for prompt-2: 'Paris.'
+49    # Generation for prompt-1: 'likely to nomin'
+50    # Generation for prompt-0: '\n\nJane'
+51    # Generation for prompt-3: 'an exciting time'
+52    # Generation for prompt-1: 'likely to nominate'
+53    # Generation for prompt-0: '\n\nJane Smith'
+54    # Generation for prompt-3: 'an exciting time for'
+55    # Generation for prompt-1: 'likely to nominate a'
+56    # Generation for prompt-0: '\n\nJane Smith.'
+57    # Generation for prompt-3: 'an exciting time for us'
+58    # Generation for prompt-1: 'likely to nominate a new'
+59
+60
+61if __name__ == '__main__':
+62    main()
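
For reference, a minimal sketch (not part of the page above) of a helper that keeps only the final text per prompt. It assumes, as the sample output above suggests, that the streamed output.outputs[0].text field is cumulative rather than a per-step delta; the helper name final_text is illustrative only.

import asyncio
from tensorrt_llm import LLM, SamplingParams

async def final_text(llm: LLM, prompt: str, sampling_params: SamplingParams) -> str:
    # Iterate over the streamed partial results and keep only the last one.
    text = ""
    async for output in llm.generate_async(prompt, sampling_params, streaming=True):
        text = output.outputs[0].text  # cumulative text so far (see sample output above)
    return text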
 
@@ -592,20 +573,20 @@

previous
- Generation with Quantization
+ Generate text asynchronously
next
- Generate text with guided decoding
+ Distributed LLM Generation

diff --git a/latest/examples/llm_inference_customize.html b/latest/examples/llm_inference_customize.html
deleted file mode 100644
index e6f1d43927..0000000000
--- a/latest/examples/llm_inference_customize.html
+++ /dev/null

Generate text with customization#

-

Source NVIDIA/TensorRT-LLM.

-
 1### Generate text with customization
- 2import tempfile
- 3
- 4from tensorrt_llm._tensorrt_engine import LLM
- 5from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
- 6
- 7
- 8def main():
- 9    # The end user can customize the build configuration with the build_config class and other arguments borrowed from the lower-level APIs
-10    build_config = BuildConfig()
-11    build_config.max_batch_size = 128
-12    build_config.max_num_tokens = 2048
-13
-14    build_config.max_beam_width = 4
-15
-16    # Model could accept HF model name or a path to local HF model.
-17
-18    llm = LLM(
-19        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-20        build_config=build_config,
-21        kv_cache_config=KvCacheConfig(
-22            free_gpu_memory_fraction=0.8
-23        ),  # Similar to `build_config`, you can also customize the runtime configuration with the `kv_cache_config`, `runtime_config`, `peft_cache_config` or \
-24        # other arguments borrowed from the lower-level APIs.
-25    )
-26
-27    # You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine.
-28    llm.save(tempfile.mkdtemp())
-29
-30    # Sample prompts.
-31    prompts = [
-32        "Hello, my name is",
-33        "The president of the United States is",
-34        "The capital of France is",
-35        "The future of AI is",
-36    ]
-37
-38    # With SamplingParams, you can customize the sampling strategy, such as beam search, temperature, and so on.
-39    sampling_params = SamplingParams(temperature=0.8,
-40                                     top_p=0.95,
-41                                     n=4,
-42                                     use_beam_search=True)
-43
-44    for output in llm.generate(prompts, sampling_params):
-45        print(
-46            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-47        )
-48
-49    # Got output like
-50    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-51    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-52    # Prompt: 'The capital of France is', Generated text: 'Paris.'
-53    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-54
-55
-56if __name__ == '__main__':
-57    main()
-
-
diff --git a/latest/examples/llm_inference_distributed.html b/latest/examples/llm_inference_distributed.html
index c7706a797c..471ac80d2e 100644
--- a/latest/examples/llm_inference_distributed.html
+++ b/latest/examples/llm_inference_distributed.html


@@ -513,51 +496,49 @@

Distributed LLM Generation#

Source NVIDIA/TensorRT-LLM.

-
 1### Distributed LLM Generation
- 2from tensorrt_llm import SamplingParams
- 3from tensorrt_llm._tensorrt_engine import LLM
- 4
- 5
- 6def main():
- 7    # model could accept HF model name or a path to local HF model.
- 8    llm = LLM(
- 9        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-10        # Enable 2-way tensor parallelism
-11        tensor_parallel_size=2
-12        # Enable 2-way pipeline parallelism if needed
-13        # pipeline_parallel_size=2
-14        # Enable 2-way expert parallelism for MoE model's expert weights
-15        # moe_expert_parallel_size=2
-16        # Enable 2-way tensor parallelism for MoE model's expert weights
-17        # moe_tensor_parallel_size=2
-18    )
-19
-20    # Sample prompts.
-21    prompts = [
-22        "Hello, my name is",
-23        "The president of the United States is",
-24        "The capital of France is",
-25        "The future of AI is",
-26    ]
-27
-28    # Create a sampling params.
-29    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-30
-31    for output in llm.generate(prompts, sampling_params):
-32        print(
-33            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-34        )
-35
-36    # Got output like
-37    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-38    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-39    # Prompt: 'The capital of France is', Generated text: 'Paris.'
-40    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
-41
-42
-43# The entry point of the program need to be protected for spawning processes.
-44if __name__ == '__main__':
-45    main()
+
 1from tensorrt_llm import LLM, SamplingParams
+ 2
+ 3
+ 4def main():
+ 5    # model could accept HF model name or a path to local HF model.
+ 6    llm = LLM(
+ 7        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ 8        # Enable 2-way tensor parallelism
+ 9        tensor_parallel_size=2
+10        # Enable 2-way pipeline parallelism if needed
+11        # pipeline_parallel_size=2
+12        # Enable 2-way expert parallelism for MoE model's expert weights
+13        # moe_expert_parallel_size=2
+14        # Enable 2-way tensor parallelism for MoE model's expert weights
+15        # moe_tensor_parallel_size=2
+16    )
+17
+18    # Sample prompts.
+19    prompts = [
+20        "Hello, my name is",
+21        "The president of the United States is",
+22        "The capital of France is",
+23        "The future of AI is",
+24    ]
+25
+26    # Create a sampling params.
+27    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+28
+29    for output in llm.generate(prompts, sampling_params):
+30        print(
+31            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+32        )
+33
+34    # Got output like
+35    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+36    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
+37    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+38    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+39
+40
+41# The entry point of the program need to be protected for spawning processes.
+42if __name__ == '__main__':
+43    main()
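
The commented options above can be combined; the process world size then has to cover the product of the parallelism degrees, so tensor_parallel_size=2 together with pipeline_parallel_size=2 needs 2 * 2 = 4 GPUs. A minimal sketch under that assumption (4 GPUs available), using only the parameters already shown in the listing:

from tensorrt_llm import LLM

# Sketch only: 2-way tensor parallelism combined with 2-way pipeline parallelism,
# so the model is sharded across 2 * 2 = 4 GPUs.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,
)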
 
@@ -573,20 +554,20 @@

previous
- Generate Text Asynchronously
+ Generate text in streaming
next
- Control generated text using logits processor
+ Generate text with guided decoding

diff --git a/latest/examples/llm_inference_kv_events.html b/latest/examples/llm_inference_kv_events.html
deleted file mode 100644
index 4ff4837ae7..0000000000
--- a/latest/examples/llm_inference_kv_events.html
+++ /dev/null

Get KV Cache Events#

-

Source NVIDIA/TensorRT-LLM.

-
 1### Get KV Cache Events
- 2
- 3from tensorrt_llm import SamplingParams
- 4from tensorrt_llm._tensorrt_engine import LLM
- 5from tensorrt_llm.llmapi import KvCacheConfig
- 6
- 7
- 8def main():
- 9
-10    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-11              tensor_parallel_size=2,
-12              autotuner_enabled=False,
-13              kv_cache_dtype='auto',
-14              kv_cache_config=KvCacheConfig(enable_block_reuse=True,
-15                                            event_buffer_max_size=1024),
-16              backend="pytorch")
-17
-18    # Sample prompts having a common prefix.
-19    common_prefix = (
-20        "After the ghost's departure, Barnardo notes Horatio's pale appearance and asks if he's okay. "
-21        "Horatio concedes that he's shaken and confesses that, without witnessing the ghost himself, he wouldn't have believed it existed. "
-22        "He's also disturbed by the ghost's striking resemblance to the king. It even seems to be wearing the former king's armor. "
-23        "Horatio thinks the ghost's presence foretells that something is about to go wrong in Denmark. "
-24        "Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships."
-25    )
-26    prompts = [
-27        common_prefix, common_prefix + " Marcellus also notes that the king's"
-28    ]
-29
-30    # Create a sampling params.
-31    sampling_params = SamplingParams(temperature=0.001,
-32                                     top_p=0.001,
-33                                     max_tokens=5)
-34
-35    for output in llm.generate(prompts, sampling_params=sampling_params):
-36        print(
-37            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-38        )
-39
-40    kv_events = llm.get_kv_cache_events(10)
-41    print(kv_events)
-42
-43    # Got output like follows:
-44    # [{'event_id': 0, 'data': {'type': 'created', 'num_blocks_per_cache_level': [101230, 0]}},
-45    #  {'event_id': 1, 'data': {'type': 'stored', 'parent_hash': None, 'blocks': [{'type': 'stored_block', 'block_hash': 4203099703668305365, 'tokens': [{'type': 'unique_token', 'token_id': 1, 'token_extra_id': 0}, ...
-46
-47
-48if __name__ == '__main__':
-49    main()
-
-
diff --git a/latest/examples/llm_logits_processor.html b/latest/examples/llm_logits_processor.html
index d359b10746..2fb2e355b0 100644
--- a/latest/examples/llm_logits_processor.html
+++ b/latest/examples/llm_logits_processor.html


@@ -513,123 +496,131 @@

Control generated text using logits processor#

Source NVIDIA/TensorRT-LLM.

-
  1### Control generated text using logits processor
-  2from typing import List, Optional
-  3
-  4import torch
+
  1from typing import List, Optional
+  2
+  3import torch
+  4from transformers import PreTrainedTokenizer
   5
-  6from tensorrt_llm._tensorrt_engine import LLM
-  7from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
-  8                                          LogitsProcessor, SamplingParams)
+  6from tensorrt_llm import LLM
+  7from tensorrt_llm.sampling_params import LogitsProcessor, SamplingParams
+  8
   9
- 10
- 11# The recommended way to create a customized logits processor:
- 12#     * Subclass LogitsProcessor and implement the processing logics in the __call__ method.
- 13#     * Create an instance and pass to SamplingParams.
- 14# Alternatively, you can create any callable with the same signature with the __call__ method.
- 15# This simple callback will output a specific token at each step irrespective of prompt.
- 16# Refer to ../bindings/executor/example_logits_processor.py for a more
- 17# sophisticated callback that generates JSON structured output.
- 18class MyLogitsProcessor(LogitsProcessor):
- 19
- 20    def __init__(self, allowed_token_id: int):
- 21        self.allowed_token_id = allowed_token_id
- 22
- 23    def __call__(self, req_id: int, logits: torch.Tensor,
- 24                 token_ids: List[List[int]], stream_ptr: int,
- 25                 client_id: Optional[int]):
- 26        mask = torch.full_like(logits, fill_value=float("-inf"), device="cpu")
- 27        mask[:, :, self.allowed_token_id] = 0
- 28
- 29        stream = None if stream_ptr is None else torch.cuda.ExternalStream(
- 30            stream_ptr)
- 31        with torch.cuda.stream(stream):
- 32            mask = mask.to(logits.device, non_blocking=True)
- 33            logits += mask
- 34
- 35
- 36# The recommended way to create a customized batched logits processor:
- 37#     * Subclass BatchedLogitsProcessor and implement the processing logics in the __call__ method.
- 38#     * Create an instance and pass to LLM.
- 39# Alternatively, you can create any callable with the same signature with the __call__ method.
- 40# A batched logits processor's arguments for all requests in a batch are made available as lists.
- 41# This helps user optimize the callback for large batch sizes. For example:
- 42# 1. Process more work on host, e.g. running a JSON state machine, in parallel with model forward pass on device.
- 43# 2. Coalesce H2D memory transfers for all requests into a single cudaMemcpyAsync call.
- 44# 3. Launch a single batched kernel, e.g. for updating logits on device.
- 45class MyBatchedLogitsProcessor(BatchedLogitsProcessor):
+ 10def text_to_token(tokenizer: PreTrainedTokenizer, text: str, last: bool):
+ 11    tokens = tokenizer.encode(text, add_special_tokens=False)
+ 12
+ 13    max_token_count = 1
+ 14    bos_token_added = getattr(tokenizer, 'bos_token', None) and getattr(
+ 15        tokenizer, 'bos_token_id', None) in tokens
+ 16    prefix_token_added = getattr(tokenizer, 'add_prefix_space',
+ 17                                 None) is not False
+ 18    if bos_token_added or prefix_token_added:
+ 19        max_token_count = 2
+ 20
+ 21    if not last and len(tokens) > max_token_count:
+ 22        raise Exception(
+ 23            f"Can't convert {text} to token. It has {len(tokens)} tokens.")
+ 24
+ 25    return tokens[-1]
+ 26
+ 27
+ 28# The recommended way to create a customized logits processor:
+ 29#     * Subclass LogitsProcessor and implement the processing logics in the __call__ method.
+ 30#     * Create an instance and pass to SamplingParams.
+ 31# More LogitsProcessors references can be found at https://github.com/NVIDIA/logits-processor-zoo.
+ 32class GenLengthLogitsProcessor(LogitsProcessor):
+ 33    """
+ 34    A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token
+ 35    based on the length of the generated sequence, encouraging or discouraging shorter answers.
+ 36    WARNING: Create a new object before every model.generate call since token_count is accumulated.
+ 37
+ 38    Parameters
+ 39    ----------
+ 40    tokenizer: The tokenizer used by the LLM.
+ 41    boost_factor (float): A factor to boost the likelihood of the EOS token as the sequence length increases.
+ 42                        Suggested value range is [-1.0, 1.0]. Negative values are used for the opposite effect.
+ 43    p (int, optional): The power to which the token count is raised when computing the boost value. Default is 2.
+ 44    complete_sentences (bool, optional): If True, boosts EOS token likelihood only when the last token is a full stop
+ 45                                        or a new line. Default is False.
  46
- 47    def __init__(self, allowed_token_id: int):
- 48        self.allowed_token_id = allowed_token_id
- 49
- 50    def __call__(self, req_ids: List[int], logits: List[torch.Tensor],
- 51                 token_ids: List[List[List[int]]], stream_ptr: int,
- 52                 client_ids: List[Optional[int]]):
- 53        # Generate masks for all requests on host
- 54        masks = []
- 55        for req_id, req_logits, req_token_ids, client_id in zip(
- 56                req_ids, logits, token_ids, client_ids):
- 57            mask = torch.full_like(req_logits,
- 58                                   fill_value=float("-inf"),
- 59                                   device="cpu")
- 60            mask[:, :, self.allowed_token_id] = 0
- 61            masks.append(mask)
- 62
- 63        # Move masks to device and add to logits using non-blocking operations
- 64        with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)):
- 65            for req_logits, mask in zip(logits, masks):
- 66                req_logits += mask.to(req_logits.device, non_blocking=True)
- 67
- 68
- 69def main():
+ 47    """
+ 48
+ 49    def __init__(self,
+ 50                 tokenizer,
+ 51                 boost_factor: float,
+ 52                 p: int = 2,
+ 53                 complete_sentences: bool = False):
+ 54        self.eos_token = tokenizer.eos_token_id
+ 55        self.boost_factor = boost_factor
+ 56        self.p = p
+ 57        self.token_count = 0
+ 58        self.full_stop_token = text_to_token(tokenizer,
+ 59                                             "It is a sentence.",
+ 60                                             last=True)
+ 61        self.new_line_token = text_to_token(tokenizer,
+ 62                                            "It is a new line\n",
+ 63                                            last=True)
+ 64        self.complete_sentences = complete_sentences
+ 65
+ 66    def __call__(self, req_ids: int, logits: torch.Tensor, ids: List[List[int]],
+ 67                 stream_ptr, client_id: Optional[int]):
+ 68        boost_val = self.boost_factor * (self.token_count**self.p) / (10**
+ 69                                                                      self.p)
  70
- 71    # Batched logits processor (only supported in TensorRT backend)
- 72    # should be specified when initializing LLM.
- 73    llm = LLM(
- 74        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
- 75        batched_logits_processor=MyBatchedLogitsProcessor(allowed_token_id=42))
+ 71        stream = None if stream_ptr is None else torch.cuda.ExternalStream(
+ 72            stream_ptr)
+ 73
+ 74        with torch.cuda.stream(stream):
+ 75            ids = torch.LongTensor(ids).to(logits.device, non_blocking=True)
  76
- 77    # Sample prompts
- 78    prompts = [
- 79        "Hello, my name is",
- 80        "The president of the United States is",
- 81    ]
- 82
- 83    # Generate text
- 84    for prompt_id, prompt in enumerate(prompts):
- 85        # Use non-batched logits processor callback only for odd-numbered prompts
- 86        if prompt_id % 2 == 0:
- 87            sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
- 88        else:
- 89            # Each prompt can be specified with a logits processor at runtime
- 90            sampling_params = SamplingParams(
- 91                temperature=0.8,
- 92                top_p=0.95,
- 93                logits_processor=MyLogitsProcessor(allowed_token_id=42))
- 94
- 95        for output in llm.generate([prompt], sampling_params):
- 96            print(
- 97                f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
- 98            )
- 99
-100    # Got output like
-101    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-102    # Prompt: 'The president of the United States is', Generated text: "''''''''''''''''''''''''''''''''"
-103
-104    # Use batched processor with batch size = 2
-105    sampling_params = SamplingParams(apply_batched_logits_processor=True)
-106    for output in llm.generate(prompts, sampling_params):
-107        print(
-108            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-109        )
-110
-111    # Got output like
-112    # Prompt: 'Hello, my name is', Generated text: "''''''''''''''''''''''''''''''''"
-113    # Prompt: 'The president of the United States is', Generated text: "''''''''''''''''''''''''''''''''"
+ 77            if self.complete_sentences:
+ 78                enabled = (ids[:, -1] == self.full_stop_token) | (
+ 79                    ids[:, -1] == self.new_line_token)
+ 80                logits[:, :, self.eos_token] += enabled * boost_val
+ 81            else:
+ 82                logits[:, :, self.eos_token] += boost_val
+ 83
+ 84        self.token_count += 1
+ 85
+ 86
+ 87def main():
+ 88
+ 89    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+ 90
+ 91    # Sample prompts
+ 92    prompts = [
+ 93        "The future of AI is",
+ 94        "The future of AI is",
+ 95    ]
+ 96
+ 97    # Generate text
+ 98    for prompt_id, prompt in enumerate(prompts):
+ 99        if prompt_id % 2 == 0:
+100            # Without logit processor
+101            sampling_params = SamplingParams(top_p=1, max_tokens=200)
+102        else:
+103            # Each prompt can be specified with a logits processor at runtime
+104            sampling_params = SamplingParams(
+105                temperature=0.8,
+106                top_p=0.95,
+107                logits_processor=GenLengthLogitsProcessor(
+108                    llm.tokenizer, boost_factor=1, complete_sentences=True))
+109
+110        output = llm.generate(prompt, sampling_params)
+111        print(
+112            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+113        )
 114
-115
-116if __name__ == '__main__':
-117    main()
+115    # Got output like:
+116    # Prompt (original): "bright, and it's not just for big companies. Small businesses can also benefit from AI technology. Here are some ways:\n\n1. Improved customer service: AI can help businesses provide better customer service by analyzing customer data and providing personalized recommendations.
+117    #                    This can help businesses improve their customer experience and increase customer loyalty.\n\n2. Increased productivity: AI can help businesses automate repetitive tasks, freeing up employees to focus on more complex tasks. This can
+118    #                    help businesses increase productivity and reduce costs.\n\n3. Enhanced marketing: AI can help businesses create more personalized marketing campaigns by analyzing customer data and targeting specific audiences. This can help businesses
+119    #                    increase their marketing ROI and drive more sales.\n\n4. Improved supply chain management: AI can help businesses optimize their supply chain by analyzing data on demand,"'
+120    #
+121    # Prompt (with GenLengthLogitsProcessor): "bright, and it's not just for big companies. Small businesses can also benefit from AI technology."
+122
+123
+124if __name__ == '__main__':
+125    main()
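
For comparison with the masking-style callback this page previously showed (removed above), here is a minimal sketch of a single-token mask processor in the same style. The class name AllowOnlyTokenProcessor is illustrative; the body is adapted from the removed MyLogitsProcessor and the call signature mirrors GenLengthLogitsProcessor above.

import torch
from typing import List, Optional
from tensorrt_llm.sampling_params import LogitsProcessor

class AllowOnlyTokenProcessor(LogitsProcessor):
    """Force generation of a single allowed token id (adapted from the removed MyLogitsProcessor)."""

    def __init__(self, allowed_token_id: int):
        self.allowed_token_id = allowed_token_id

    def __call__(self, req_id: int, logits: torch.Tensor,
                 token_ids: List[List[int]], stream_ptr: Optional[int],
                 client_id: Optional[int]):
        # Build the mask on the host, then add it to the logits on the model's CUDA stream.
        mask = torch.full_like(logits, fill_value=float("-inf"), device="cpu")
        mask[:, :, self.allowed_token_id] = 0
        stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
        with torch.cuda.stream(stream):
            logits += mask.to(logits.device, non_blocking=True)

It would be passed per request exactly like GenLengthLogitsProcessor above, via SamplingParams(logits_processor=AllowOnlyTokenProcessor(token_id)).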
 
@@ -645,20 +636,20 @@

previous
- Distributed LLM Generation
+ Generate text with guided decoding
next
- Generate Text Using Eagle2 Decoding
+ Generate text with multiple LoRA adapters

diff --git a/latest/examples/llm_medusa_decoding.html b/latest/examples/llm_medusa_decoding.html
deleted file mode 100644
index 3909b68f13..0000000000
--- a/latest/examples/llm_medusa_decoding.html
+++ /dev/null

Generate Text Using Medusa Decoding#

-

Source NVIDIA/TensorRT-LLM.

-
 1### Generate Text Using Medusa Decoding
- 2import argparse
- 3from pathlib import Path
- 4
- 5from tensorrt_llm._tensorrt_engine import LLM
- 6from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
- 7                                 MedusaDecodingConfig, SamplingParams)
- 8
- 9
-10def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None):
-11    # Sample prompts.
-12    prompts = [
-13        "Hello, my name is",
-14        "The president of the United States is",
-15        "The capital of France is",
-16        "The future of AI is",
-17    ]
-18    # The end user can customize the sampling configuration with the SamplingParams class
-19    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-20
-21    # The end user can customize the build configuration with the BuildConfig class
-22    build_config = BuildConfig(
-23        max_batch_size=1,
-24        max_seq_len=1024,
-25    )
-26
-27    # The end user can customize the kv cache configuration with the KVCache class
-28    kv_cache_config = KvCacheConfig(enable_block_reuse=True)
-29
-30    llm_kwargs = {}
-31
-32    if use_modelopt_ckpt:
-33        # This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer.
-34        # Both the base model (except lm_head) and Medusa heads have been quantized in FP8.
-35        model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8"
-36
-37        # ModelOpt ckpt uses 3 Medusa heads
-38        speculative_config = MedusaDecodingConfig(
-39                            max_draft_len=63,
-40                            num_medusa_heads=3,
-41                            medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], \
-42                                [4], [0, 4], [2, 0], [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], \
-43                                    [7], [0, 8], [0, 0, 2], [3, 0], [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], \
-44                                        [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], \
-45                                            [0, 0, 8], [0, 0, 9], [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [1, 6], [0, 7, 0]]
-46        )
-47    else:
-48        # In this path, base model and Medusa heads are stored and loaded separately.
-49        model = "lmsys/vicuna-7b-v1.3"
-50
-51        # The end user can customize the medusa decoding configuration by specifying the
-52        # speculative_model, max_draft_len, medusa heads num and medusa choices
-53        # with the MedusaDecodingConfig class
-54        speculative_config = MedusaDecodingConfig(
-55                                        speculative_model="FasterDecoding/medusa-vicuna-7b-v1.3",
-56                                        max_draft_len=63,
-57                                        num_medusa_heads=4,
-58                                        medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \
-59                                                [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \
-60                                                [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \
-61                                                [0, 0, 0, 0], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], \
-62                                                [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], \
-63                                                [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [0, 0, 0, 1], [1, 6], [0, 7, 0]]
-64        )
-65
-66    # Add 'tensor_parallel_size=2' if using ckpt for
-67    # a larger model like nvidia/Llama-3.1-70B-Medusa.
-68    llm = LLM(model=model,
-69              build_config=build_config,
-70              kv_cache_config=kv_cache_config,
-71              speculative_config=speculative_config,
-72              **llm_kwargs)
-73
-74    outputs = llm.generate(prompts, sampling_params)
-75
-76    # Print the outputs.
-77    for output in outputs:
-78        prompt = output.prompt
-79        generated_text = output.outputs[0].text
-80        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-81
-82
-83if __name__ == '__main__':
-84    parser = argparse.ArgumentParser(
-85        description="Generate text using Medusa decoding.")
-86    parser.add_argument(
-87        '--use_modelopt_ckpt',
-88        action='store_true',
-89        help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.")
-90    # TODO: remove this arg after ModelOpt ckpt is public on HF
-91    parser.add_argument('--model_dir', type=Path, default=None)
-92    args = parser.parse_args()
-93
-94    run_medusa_decoding(args.use_modelopt_ckpt, args.model_dir)
-
-
diff --git a/latest/examples/llm_mgmn_llm_distributed.html b/latest/examples/llm_mgmn_llm_distributed.html
index bcffc7559a..9449fd323d 100644
--- a/latest/examples/llm_mgmn_llm_distributed.html
+++ b/latest/examples/llm_mgmn_llm_distributed.html
@@ -9,7 +9,7 @@
- Llm Mgmn Llm Distributed — TensorRT-LLM
+ Run LLM-API with pytorch backend on Slurm — TensorRT-LLM


@@ -510,8 +493,8 @@
-
-

Llm Mgmn Llm Distributed#

+
+

Run LLM-API with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

 1#!/bin/bash
  2#SBATCH -A <account>    # parameter
@@ -523,49 +506,48 @@
  8#SBATCH -e logs/llmapi-distributed.err
  9#SBATCH -J llmapi-distributed-task
 10
-11### Run LLM-API with pytorch backend on Slurm
-12
-13# NOTE, this feature is experimental and may not work on all systems.
-14# The trtllm-llmapi-launch is a script that launches the LLM-API code on
-15# Slurm-like systems, and can support multi-node and multi-GPU setups.
-16
-17# Note that, the number of MPI processes should be the same as the model world
-18# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
-19# each, or 4 nodes with 4 gpus for each or other combinations.
-20
-21# This docker image should have tensorrt_llm installed, or you need to install
-22# it in the task.
-23
-24# The following variables are expected to be set in the environment:
-25# You can set them via --export in the srun/sbatch command.
-26#   CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.
-27#   MOUNT_DIR: the directory to mount in the container
-28#   MOUNT_DEST: the destination directory in the container
-29#   WORKDIR: the working directory in the container
-30#   SOURCE_ROOT: the path to the TensorRT-LLM source
-31#   PROLOGUE: the prologue to run before the script
-32#   LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
-33#      not supported in Slurm mode, you need to download the model and put it in
-34#      the LOCAL_MODEL directory.
-35
-36# Adjust the paths to run
-37export script=$SOURCE_ROOT/examples/pytorch/quickstart_advanced.py
-38
-39# Just launch the PyTorch example with trtllm-llmapi-launch command.
-40srun -l \
-41    --container-image=${CONTAINER_IMAGE} \
-42    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
-43    --container-workdir=${WORKDIR} \
-44    --export=ALL \
-45    --mpi=pmix \
-46    bash -c "
-47        $PROLOGUE
-48        export PATH=$PATH:~/.local/bin
-49        trtllm-llmapi-launch python3 $script \
-50            --model_dir $LOCAL_MODEL \
-51            --prompt 'Hello, how are you?' \
-52            --tp_size 2
-53    "
+11
+12# NOTE, this feature is experimental and may not work on all systems.
+13# The trtllm-llmapi-launch is a script that launches the LLM-API code on
+14# Slurm-like systems, and can support multi-node and multi-GPU setups.
+15
+16# Note that, the number of MPI processes should be the same as the model world
+17# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
+18# each, or 4 nodes with 4 gpus for each or other combinations.
+19
+20# This docker image should have tensorrt_llm installed, or you need to install
+21# it in the task.
+22
+23# The following variables are expected to be set in the environment:
+24# You can set them via --export in the srun/sbatch command.
+25#   CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.
+26#   MOUNT_DIR: the directory to mount in the container
+27#   MOUNT_DEST: the destination directory in the container
+28#   WORKDIR: the working directory in the container
+29#   SOURCE_ROOT: the path to the TensorRT-LLM source
+30#   PROLOGUE: the prologue to run before the script
+31#   LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
+32#      not supported in Slurm mode, you need to download the model and put it in
+33#      the LOCAL_MODEL directory.
+34
+35# Adjust the paths to run
+36export script=$SOURCE_ROOT/examples/pytorch/quickstart_advanced.py
+37
+38# Just launch the PyTorch example with trtllm-llmapi-launch command.
+39srun -l \
+40    --container-image=${CONTAINER_IMAGE} \
+41    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
+42    --container-workdir=${WORKDIR} \
+43    --export=ALL \
+44    --mpi=pmix \
+45    bash -c "
+46        $PROLOGUE
+47        export PATH=$PATH:~/.local/bin
+48        trtllm-llmapi-launch python3 $script \
+49            --model_dir $LOCAL_MODEL \
+50            --prompt 'Hello, how are you?' \
+51            --tp_size 2
+52    "
 
@@ -581,12 +563,12 @@

previous

-

Automatic Parallelism with LLM

+

Generate text with multiple LoRA adapters

next

-

Llm Mgmn Trtllm Bench

+

Run trtllm-bench with pytorch backend on Slurm

diff --git a/latest/examples/llm_mgmn_trtllm_bench.html b/latest/examples/llm_mgmn_trtllm_bench.html
index cb6e79521e..4721691b0d 100644
--- a/latest/examples/llm_mgmn_trtllm_bench.html
+++ b/latest/examples/llm_mgmn_trtllm_bench.html
@@ -9,7 +9,7 @@
- Llm Mgmn Trtllm Bench — TensorRT-LLM
+ Run trtllm-bench with pytorch backend on Slurm — TensorRT-LLM


@@ -510,8 +493,8 @@
-
-

Llm Mgmn Trtllm Bench#

+
+

Run trtllm-bench with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

 1#!/bin/bash
  2#SBATCH -A <account>
@@ -523,90 +506,87 @@
  8#SBATCH -e logs/trtllm-bench.err
  9#SBATCH -J trtllm-bench
 10
-11### Run trtllm-bench with pytorch backend on Slurm
-12
-13# NOTE, this feature is experimental and may not work on all systems.
-14# The trtllm-llmapi-launch is a script that launches the LLM-API code on
-15# Slurm-like systems, and can support multi-node and multi-GPU setups.
-16
-17# Note that, the number of MPI processes should be the same as the model world
-18# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
-19# each, or 4 nodes with 4 gpus for each or other combinations.
-20
-21# This docker image should have tensorrt_llm installed, or you need to install
-22# it in the task.
-23
-24# The following variables are expected to be set in the environment:
-25# You can set them via --export in the srun/sbatch command.
-26#   CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.
-27#   MOUNT_DIR: the directory to mount in the container
-28#   MOUNT_DEST: the destination directory in the container
-29#   WORKDIR: the working directory in the container
-30#   SOURCE_ROOT: the path to the TensorRT-LLM source
-31#   PROLOGUE: the prologue to run before the script
-32#   LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
-33#      not supported in Slurm mode, you need to download the model and put it in
-34#      the LOCAL_MODEL directory.
-35
-36export prepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py"
-37export data_path="$WORKDIR/token-norm-dist.txt"
-38
-39echo "Preparing dataset..."
-40srun -l \
-41    -N 1 \
-42    -n 1 \
-43    --container-image=${CONTAINER_IMAGE} \
-44    --container-name="prepare-name" \
-45    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
-46    --container-workdir=${WORKDIR} \
-47    --export=ALL \
-48    --mpi=pmix \
-49    bash -c "
-50        $PROLOGUE
-51        python3 $prepare_dataset \
-52            --tokenizer=$LOCAL_MODEL \
-53            --stdout token-norm-dist \
-54            --num-requests=100 \
-55            --input-mean=128 \
-56            --output-mean=128 \
-57            --input-stdev=0 \
-58            --output-stdev=0 > $data_path
-59    "
-60
-61echo "Running benchmark..."
-62# Just launch trtllm-bench job with trtllm-llmapi-launch command.
-63
-64srun -l \
-65    --container-image=${CONTAINER_IMAGE} \
-66    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
-67    --container-workdir=${WORKDIR} \
-68    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
-69    --mpi=pmix \
-70    bash -c "
-71        set -ex
-72        $PROLOGUE
-73        export PATH=$PATH:~/.local/bin
-74
-75        # This is optional
-76        cat > /tmp/pytorch_extra_args.txt << EOF
-77use_cuda_graph: false
-78cuda_graph_padding_enabled: false
-79print_iter_log: true
-80enable_attention_dp: false
-81EOF
-82
-83        # launch the benchmark
-84        trtllm-llmapi-launch \
-85         trtllm-bench \
-86            --model $MODEL_NAME \
-87            --model_path $LOCAL_MODEL \
-88            throughput \
-89            --dataset $data_path \
-90            --backend pytorch \
-91            --tp 16 \
-92            --extra_llm_api_options /tmp/pytorch_extra_args.txt \
-93            $EXTRA_ARGS
-94    "
+11
+12# NOTE, this feature is experimental and may not work on all systems.
+13# The trtllm-llmapi-launch is a script that launches the LLM-API code on
+14# Slurm-like systems, and can support multi-node and multi-GPU setups.
+15
+16# Note that the number of MPI processes should match the model world
+17# size. For example, for tensor_parallel_size=16 you may use 2 nodes with 8 GPUs
+18# each, 4 nodes with 4 GPUs each, or other combinations.
+19
+20# This docker image should have tensorrt_llm installed, or you need to install
+21# it in the task.
+22
+23# The following variables are expected to be set in the environment:
+24# You can set them via --export in the srun/sbatch command.
+25#   CONTAINER_IMAGE: the docker image to use; it should have tensorrt_llm installed, or you can install it in the task.
+26#   MOUNT_DIR: the directory to mount in the container
+27#   MOUNT_DEST: the destination directory in the container
+28#   WORKDIR: the working directory in the container
+29#   SOURCE_ROOT: the path to the TensorRT-LLM source
+30#   PROLOGUE: the prologue to run before the script
+31#   LOCAL_MODEL: the local model directory to use. NOTE: downloading from HF is
+32#      not supported in Slurm mode; download the model and put it in
+33#      the LOCAL_MODEL directory beforehand.
+34
+35export prepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py"
+36export data_path="$WORKDIR/token-norm-dist.txt"
+37
+38echo "Preparing dataset..."
+39srun -l \
+40    -N 1 \
+41    -n 1 \
+42    --container-image=${CONTAINER_IMAGE} \
+43    --container-name="prepare-name" \
+44    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
+45    --container-workdir=${WORKDIR} \
+46    --export=ALL \
+47    --mpi=pmix \
+48    bash -c "
+49        $PROLOGUE
+50        python3 $prepare_dataset \
+51            --tokenizer=$LOCAL_MODEL \
+52            --stdout token-norm-dist \
+53            --num-requests=100 \
+54            --input-mean=128 \
+55            --output-mean=128 \
+56            --input-stdev=0 \
+57            --output-stdev=0 > $data_path
+58    "
+59
+60echo "Running benchmark..."
+61# Launch the trtllm-bench job with the trtllm-llmapi-launch command.
+62
+63srun -l \
+64    --container-image=${CONTAINER_IMAGE} \
+65    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
+66    --container-workdir=${WORKDIR} \
+67    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
+68    --mpi=pmix \
+69    bash -c "
+70        set -ex
+71        $PROLOGUE
+72        export PATH=$PATH:~/.local/bin
+73
+74        # This is optional
+75        cat > /tmp/pytorch_extra_args.txt << EOF
+76print_iter_log: true
+77enable_attention_dp: false
+78EOF
+79
+80        # launch the benchmark
+81        trtllm-llmapi-launch \
+82         trtllm-bench \
+83            --model $MODEL_NAME \
+84            --model_path $LOCAL_MODEL \
+85            throughput \
+86            --dataset $data_path \
+87            --backend pytorch \
+88            --tp 16 \
+89            --extra_llm_api_options /tmp/pytorch_extra_args.txt \
+90            $EXTRA_ARGS
+91    "
 
@@ -627,7 +607,7 @@

previous

-

Llm Mgmn Llm Distributed

+

Run LLM-API with pytorch backend on Slurm

next

-

Llm Mgmn Trtllm Serve

+

Run trtllm-serve with pytorch backend on Slurm

@@ -739,9 +719,9 @@
diff --git a/latest/examples/llm_mgmn_trtllm_serve.html b/latest/examples/llm_mgmn_trtllm_serve.html
index 90b29ee908..71585a394b 100644
--- a/latest/examples/llm_mgmn_trtllm_serve.html
+++ b/latest/examples/llm_mgmn_trtllm_serve.html
@@ -9,7 +9,7 @@
- Llm Mgmn Trtllm Serve — TensorRT-LLM
+ Run trtllm-serve with pytorch backend on Slurm — TensorRT-LLM

Installation

LLM API

Examples

Blogs

@@ -510,8 +493,8 @@
-
-

Llm Mgmn Trtllm Serve#

+
+

Run trtllm-serve with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

 1#!/bin/bash
  2#SBATCH -A <account>
@@ -523,51 +506,50 @@
  8#SBATCH -e logs/trtllm-serve.err
  9#SBATCH -J trtllm-serve
 10
-11### Run trtllm-serve with pytorch backend on Slurm
-12
-13# NOTE, this feature is experimental and may not work on all systems.
-14# The trtllm-llmapi-launch is a script that launches the LLM-API code on
-15# Slurm-like systems, and can support multi-node and multi-GPU setups.
-16
-17# Note that, the number of MPI processes should be the same as the model world
-18# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
-19# each, or 4 nodes with 4 gpus for each or other combinations.
-20
-21# This docker image should have tensorrt_llm installed, or you need to install
-22# it in the task.
-23
-24# The following variables are expected to be set in the environment:
-25# You can set them via --export in the srun/sbatch command.
-26#   CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.
-27#   MOUNT_DIR: the directory to mount in the container
-28#   MOUNT_DEST: the destination directory in the container
-29#   WORKDIR: the working directory in the container
-30#   SOURCE_ROOT: the path to the TensorRT-LLM source
-31#   PROLOGUE: the prologue to run before the script
-32#   LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
-33#      not supported in Slurm mode, you need to download the model and put it in
-34#      the LOCAL_MODEL directory.
-35
-36echo "Starting trtllm-serve..."
-37# Just launch trtllm-serve job with trtllm-llmapi-launch command.
-38srun -l \
-39    --container-image=${CONTAINER_IMAGE} \
-40    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
-41    --container-workdir=${WORKDIR} \
-42    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
-43    --mpi=pmix \
-44    bash -c "
-45        set -ex
-46        $PROLOGUE
-47        export PATH=$PATH:~/.local/bin
-48
-49        trtllm-llmapi-launch \
-50         trtllm-serve $LOCAL_MODEL \
-51            --tp_size 16 \
-52            --backend pytorch \
-53            --host 0.0.0.0 \
-54            ${ADDITIONAL_OPTIONS}
-55    "
+11
+12# NOTE, this feature is experimental and may not work on all systems.
+13# The trtllm-llmapi-launch is a script that launches the LLM-API code on
+14# Slurm-like systems, and can support multi-node and multi-GPU setups.
+15
+16# Note that the number of MPI processes should match the model world
+17# size. For example, for tensor_parallel_size=16 you may use 2 nodes with 8 GPUs
+18# each, 4 nodes with 4 GPUs each, or other combinations.
+19
+20# This docker image should have tensorrt_llm installed, or you need to install
+21# it in the task.
+22
+23# The following variables are expected to be set in the environment:
+24# You can set them via --export in the srun/sbatch command.
+25#   CONTAINER_IMAGE: the docker image to use; it should have tensorrt_llm installed, or you can install it in the task.
+26#   MOUNT_DIR: the directory to mount in the container
+27#   MOUNT_DEST: the destination directory in the container
+28#   WORKDIR: the working directory in the container
+29#   SOURCE_ROOT: the path to the TensorRT-LLM source
+30#   PROLOGUE: the prologue to run before the script
+31#   LOCAL_MODEL: the local model directory to use. NOTE: downloading from HF is
+32#      not supported in Slurm mode; download the model and put it in
+33#      the LOCAL_MODEL directory beforehand.
+34
+35echo "Starting trtllm-serve..."
+36# Launch the trtllm-serve job with the trtllm-llmapi-launch command.
+37srun -l \
+38    --container-image=${CONTAINER_IMAGE} \
+39    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
+40    --container-workdir=${WORKDIR} \
+41    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
+42    --mpi=pmix \
+43    bash -c "
+44        set -ex
+45        $PROLOGUE
+46        export PATH=$PATH:~/.local/bin
+47
+48        trtllm-llmapi-launch \
+49         trtllm-serve $LOCAL_MODEL \
+50            --tp_size 16 \
+51            --backend pytorch \
+52            --host 0.0.0.0 \
+53            ${ADDITIONAL_OPTIONS}
+54    "
 
@@ -588,15 +570,15 @@

previous

-

Llm Mgmn Trtllm Bench

+

Run trtllm-bench with pytorch backend on Slurm

next

-

LLM Common Customizations

+

Online Serving Examples

@@ -700,9 +682,9 @@
diff --git a/latest/examples/llm_multilora.html b/latest/examples/llm_multilora.html
index 25a11b9bbe..3ccb4c7b21 100644
--- a/latest/examples/llm_multilora.html
+++ b/latest/examples/llm_multilora.html

Installation

LLM API

Examples

Blogs

@@ -513,66 +496,65 @@

Generate text with multiple LoRA adapters#

Source NVIDIA/TensorRT-LLM.

-
 1### Generate text with multiple LoRA adapters
- 2from huggingface_hub import snapshot_download
- 3
- 4from tensorrt_llm._tensorrt_engine import LLM
- 5from tensorrt_llm.executor import LoRARequest
- 6from tensorrt_llm.llmapi import BuildConfig
- 7from tensorrt_llm.lora_manager import LoraConfig
+
 1from huggingface_hub import snapshot_download
+ 2
+ 3from tensorrt_llm import LLM
+ 4from tensorrt_llm.executor import LoRARequest
+ 5from tensorrt_llm.llmapi import BuildConfig
+ 6from tensorrt_llm.lora_manager import LoraConfig
+ 7
  8
- 9
-10def main():
-11
-12    # Download the LoRA adapters from huggingface hub.
-13    lora_dir1 = snapshot_download(repo_id="snshrivas10/sft-tiny-chatbot")
-14    lora_dir2 = snapshot_download(
-15        repo_id="givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational")
-16    lora_dir3 = snapshot_download(repo_id="barissglc/tinyllama-tarot-v1")
-17
-18    # Currently, we need to pass at least one lora_dir to LLM constructor via build_config.lora_config.
-19    # This is necessary because it requires some configuration in the lora_dir to build the engine with LoRA support.
-20    build_config = BuildConfig()
-21    build_config.lora_config = LoraConfig(lora_dir=[lora_dir1])
-22    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-23              enable_lora=True,
-24              max_lora_rank=64,
-25              build_config=build_config)
-26
-27    # Sample prompts
-28    prompts = [
+ 9def main():
+10
+11    # Download the LoRA adapters from huggingface hub.
+12    lora_dir1 = snapshot_download(repo_id="snshrivas10/sft-tiny-chatbot")
+13    lora_dir2 = snapshot_download(
+14        repo_id="givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational")
+15    lora_dir3 = snapshot_download(repo_id="barissglc/tinyllama-tarot-v1")
+16
+17    # Currently, we need to pass at least one lora_dir to LLM constructor via build_config.lora_config.
+18    # This is necessary because it requires some configuration in the lora_dir to build the engine with LoRA support.
+19    build_config = BuildConfig()
+20    build_config.lora_config = LoraConfig(lora_dir=[lora_dir1])
+21    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+22              enable_lora=True,
+23              max_lora_rank=64,
+24              build_config=build_config)
+25
+26    # Sample prompts
+27    prompts = [
+28        "Hello, tell me a story: ",
 29        "Hello, tell me a story: ",
-30        "Hello, tell me a story: ",
+30        "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
 31        "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
-32        "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?",
+32        "In this reading, the Justice card represents a situation where",
 33        "In this reading, the Justice card represents a situation where",
-34        "In this reading, the Justice card represents a situation where",
-35    ]
-36
-37    # At runtime, multiple LoRA adapters can be specified via lora_request; None means no LoRA used.
-38    for output in llm.generate(prompts,
-39                               lora_request=[
-40                                   None,
-41                                   LoRARequest("chatbot", 1, lora_dir1), None,
-42                                   LoRARequest("mental-health", 2, lora_dir2),
-43                                   None,
-44                                   LoRARequest("tarot", 3, lora_dir3)
-45                               ]):
-46        prompt = output.prompt
-47        generated_text = output.outputs[0].text
-48        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-49
-50    # Got output like
-51    # Prompt: 'Hello, tell me a story: ', Generated text: '1. Start with a question: "What\'s your favorite color?" 2. Ask a question that leads to a story: "What\'s your'
-52    # Prompt: 'Hello, tell me a story: ', Generated text: '1. A person is walking down the street. 2. A person is sitting on a bench. 3. A person is reading a book.'
-53    # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (smiling) No, I'm just feeling a bit overwhelmed lately. I've been trying to"
-54    # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (sighs) Yeah, I've been struggling with some personal issues. I've been feeling like I'm"
-55    # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. The card suggests that you should take the time to consider all the options'
-56    # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. It is important to take the time to consider all the options and make'
+34    ]
+35
+36    # At runtime, multiple LoRA adapters can be specified via lora_request; None means no LoRA used.
+37    for output in llm.generate(prompts,
+38                               lora_request=[
+39                                   None,
+40                                   LoRARequest("chatbot", 1, lora_dir1), None,
+41                                   LoRARequest("mental-health", 2, lora_dir2),
+42                                   None,
+43                                   LoRARequest("tarot", 3, lora_dir3)
+44                               ]):
+45        prompt = output.prompt
+46        generated_text = output.outputs[0].text
+47        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+48
+49    # Got output like
+50    # Prompt: 'Hello, tell me a story: ', Generated text: '1. Start with a question: "What\'s your favorite color?" 2. Ask a question that leads to a story: "What\'s your'
+51    # Prompt: 'Hello, tell me a story: ', Generated text: '1. A person is walking down the street. 2. A person is sitting on a bench. 3. A person is reading a book.'
+52    # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (smiling) No, I'm just feeling a bit overwhelmed lately. I've been trying to"
+53    # Prompt: "I've noticed you seem a bit down lately. Is there anything you'd like to talk about?", Generated text: "\n\nJASON: (sighs) Yeah, I've been struggling with some personal issues. I've been feeling like I'm"
+54    # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. The card suggests that you should take the time to consider all the options'
+55    # Prompt: 'In this reading, the Justice card represents a situation where', Generated text: 'you are being asked to make a decision that will have a significant impact on your life. It is important to take the time to consider all the options and make'
+56
 57
-58
-59if __name__ == '__main__':
-60    main()
+58if __name__ == '__main__':
+59    main()
 
@@ -588,20 +570,20 @@

previous

-

Generate Text Using Medusa Decoding

+

Control generated text using logits processor

next

-

Generate Text Using Eagle Decoding

+

Run LLM-API with pytorch backend on Slurm

@@ -705,9 +687,9 @@
diff --git a/latest/examples/llm_quantization.html b/latest/examples/llm_quantization.html
deleted file mode 100644
index c21c2bc380..0000000000
--- a/latest/examples/llm_quantization.html
+++ /dev/null
@@ -1,744 +0,0 @@
- Generation with Quantization — TensorRT-LLM
Generation with Quantization#

-

Source NVIDIA/TensorRT-LLM.

-
 1### Generation with Quantization
- 2import logging
- 3
- 4import torch
- 5
- 6from tensorrt_llm import SamplingParams
- 7from tensorrt_llm._tensorrt_engine import LLM
- 8from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig
- 9
-10major, minor = torch.cuda.get_device_capability()
-11enable_fp8 = major > 8 or (major == 8 and minor >= 9)
-12enable_nvfp4 = major >= 10
-13
-14quant_and_calib_configs = []
-15
-16if not enable_nvfp4:
-17    # Example 1: Specify int4 AWQ quantization to QuantConfig.
-18    # We can skip specifying CalibConfig or leave a None as the default value.
-19    quant_and_calib_configs.append(
-20        (QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ), None))
-21
-22if enable_fp8:
-23    # Example 2: Specify FP8 quantization to QuantConfig.
-24    # We can create a CalibConfig to specify the calibration dataset and other details.
-25    # Note that the calibration dataset could be either HF dataset name or a path to local HF dataset.
-26    quant_and_calib_configs.append(
-27        (QuantConfig(quant_algo=QuantAlgo.FP8,
-28                     kv_cache_quant_algo=QuantAlgo.FP8),
-29         CalibConfig(calib_dataset='cnn_dailymail',
-30                     calib_batches=256,
-31                     calib_max_seq_length=256)))
-32else:
-33    logging.error(
-34        "FP8 quantization only works on post-ada GPUs. Skipped in the example.")
-35
-36if enable_nvfp4:
-37    # Example 3: Specify NVFP4 quantization to QuantConfig.
-38    quant_and_calib_configs.append(
-39        (QuantConfig(quant_algo=QuantAlgo.NVFP4,
-40                     kv_cache_quant_algo=QuantAlgo.FP8),
-41         CalibConfig(calib_dataset='cnn_dailymail',
-42                     calib_batches=256,
-43                     calib_max_seq_length=256)))
-44else:
-45    logging.error(
-46        "NVFP4 quantization only works on Blackwell. Skipped in the example.")
-47
-48
-49def main():
-50
-51    for quant_config, calib_config in quant_and_calib_configs:
-52        # The built-in end-to-end quantization is triggered according to the passed quant_config.
-53        llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-54                  quant_config=quant_config,
-55                  calib_config=calib_config)
-56
-57        # Sample prompts.
-58        prompts = [
-59            "Hello, my name is",
-60            "The president of the United States is",
-61            "The capital of France is",
-62            "The future of AI is",
-63        ]
-64
-65        # Create a sampling params.
-66        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-67
-68        for output in llm.generate(prompts, sampling_params):
-69            print(
-70                f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-71            )
-72        llm.shutdown()
-73
-74    # Got output like
-75    # Prompt: 'Hello, my name is', Generated text: 'Jane Smith. I am a resident of the city. Can you tell me more about the public services provided in the area?'
-76    # Prompt: 'The president of the United States is', Generated text: 'considered the head of state, and the vice president of the United States is considered the head of state. President and Vice President of the United States (US)'
-77    # Prompt: 'The capital of France is', Generated text: 'located in Paris, France. The population of Paris, France, is estimated to be 2 million. France is home to many famous artists, including Picasso'
-78    # Prompt: 'The future of AI is', Generated text: 'an open and collaborative project. The project is an ongoing effort, and we invite participation from members of the community.\n\nOur community is'
-79
-80
-81if __name__ == '__main__':
-82    main()
\ No newline at end of file
diff --git a/latest/examples/openai_chat_client.html b/latest/examples/openai_chat_client.html
index 1135866bf3..6473e9319f 100644
--- a/latest/examples/openai_chat_client.html
+++ b/latest/examples/openai_chat_client.html

Installation

LLM API

Examples

Blogs

@@ -563,7 +545,7 @@ title="next page">

next

-

OpenAI Chat Client

+

OpenAI Chat Client for Multimodal

@@ -667,9 +649,9 @@ diff --git a/latest/examples/openai_chat_client_for_multimodal.html b/latest/examples/openai_chat_client_for_multimodal.html index 47b8d9561a..bb598dfcf7 100644 --- a/latest/examples/openai_chat_client_for_multimodal.html +++ b/latest/examples/openai_chat_client_for_multimodal.html @@ -9,7 +9,7 @@ - OpenAI Chat Client — TensorRT-LLM + OpenAI Chat Client for Multimodal — TensorRT-LLM @@ -35,6 +35,7 @@ + @@ -47,11 +48,17 @@ + + + + + + @@ -63,7 +70,7 @@ - + @@ -314,58 +321,32 @@

Installation

LLM API

Examples

Blogs

@@ -510,124 +493,123 @@
-
-

OpenAI Chat Client#

+
+

OpenAI Chat Client for Multimodal#

Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

-
  1### OpenAI Chat Client
-  2
-  3from openai import OpenAI
-  4
-  5from tensorrt_llm.inputs import encode_base64_content_from_url
-  6
-  7client = OpenAI(
-  8    base_url="http://localhost:8000/v1",
-  9    api_key="tensorrt_llm",
- 10)
- 11
- 12# SINGLE IMAGE INFERENCE
- 13response = client.chat.completions.create(
- 14    model="Qwen2.5-VL-3B-Instruct",
- 15    messages=[{
- 16        "role": "system",
- 17        "content": "you are a helpful assistant"
- 18    }, {
- 19        "role":
- 20        "user",
- 21        "content": [{
- 22            "type": "text",
- 23            "text": "Describe the natural environment in the image."
- 24        }, {
- 25            "type": "image_url",
- 26            "image_url": {
- 27                "url":
- 28                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
- 29            }
- 30        }]
- 31    }],
- 32    max_tokens=64,
- 33)
- 34print(response)
- 35
- 36# MULTI IMAGE INFERENCE
- 37response = client.chat.completions.create(
- 38    model="Qwen2.5-VL-3B-Instruct",
- 39    messages=[{
- 40        "role": "system",
- 41        "content": "you are a helpful assistant"
- 42    }, {
- 43        "role":
- 44        "user",
- 45        "content": [{
- 46            "type": "text",
- 47            "text": "Tell me the difference between two images"
- 48        }, {
- 49            "type": "image_url",
- 50            "image_url": {
- 51                "url":
- 52                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"
- 53            }
- 54        }, {
- 55            "type": "image_url",
- 56            "image_url": {
- 57                "url":
- 58                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
- 59            }
- 60        }]
- 61    }],
- 62    max_tokens=64,
- 63)
- 64print(response)
- 65
- 66# SINGLE VIDEO INFERENCE
- 67response = client.chat.completions.create(
- 68    model="Qwen2.5-VL-3B-Instruct",
- 69    messages=[{
- 70        "role": "system",
- 71        "content": "you are a helpful assistant"
- 72    }, {
- 73        "role":
- 74        "user",
- 75        "content": [{
- 76            "type": "text",
- 77            "text": "Tell me what you see in the video briefly."
- 78        }, {
- 79            "type": "video_url",
- 80            "video_url": {
- 81                "url":
- 82                "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4"
- 83            }
- 84        }]
- 85    }],
- 86    max_tokens=64,
- 87)
- 88print(response)
- 89
- 90# IMAGE EMBED INFERENCE
- 91image64 = encode_base64_content_from_url(
- 92    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
- 93)
- 94response = client.chat.completions.create(
- 95    model="Qwen2.5-VL-3B-Instruct",
- 96    messages=[{
- 97        "role": "system",
- 98        "content": "you are a helpful assistant"
- 99    }, {
-100        "role":
-101        "user",
-102        "content": [{
-103            "type": "text",
-104            "text": "Describe the natural environment in the image."
-105        }, {
-106            "type": "image_url",
-107            "image_url": {
-108                "url": "data:image/png;base64," + image64
-109            }
-110        }]
-111    }],
-112    max_tokens=64,
-113)
-114print(response)
+
  1
+  2from openai import OpenAI
+  3
+  4from tensorrt_llm.inputs import encode_base64_content_from_url
+  5
+  6client = OpenAI(
+  7    base_url="http://localhost:8000/v1",
+  8    api_key="tensorrt_llm",
+  9)
+ 10
+ 11# SINGLE IMAGE INFERENCE
+ 12response = client.chat.completions.create(
+ 13    model="Qwen2.5-VL-3B-Instruct",
+ 14    messages=[{
+ 15        "role": "system",
+ 16        "content": "you are a helpful assistant"
+ 17    }, {
+ 18        "role":
+ 19        "user",
+ 20        "content": [{
+ 21            "type": "text",
+ 22            "text": "Describe the natural environment in the image."
+ 23        }, {
+ 24            "type": "image_url",
+ 25            "image_url": {
+ 26                "url":
+ 27                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
+ 28            }
+ 29        }]
+ 30    }],
+ 31    max_tokens=64,
+ 32)
+ 33print(response)
+ 34
+ 35# MULTI IMAGE INFERENCE
+ 36response = client.chat.completions.create(
+ 37    model="Qwen2.5-VL-3B-Instruct",
+ 38    messages=[{
+ 39        "role": "system",
+ 40        "content": "you are a helpful assistant"
+ 41    }, {
+ 42        "role":
+ 43        "user",
+ 44        "content": [{
+ 45            "type": "text",
+ 46            "text": "Tell me the difference between two images"
+ 47        }, {
+ 48            "type": "image_url",
+ 49            "image_url": {
+ 50                "url":
+ 51                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"
+ 52            }
+ 53        }, {
+ 54            "type": "image_url",
+ 55            "image_url": {
+ 56                "url":
+ 57                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
+ 58            }
+ 59        }]
+ 60    }],
+ 61    max_tokens=64,
+ 62)
+ 63print(response)
+ 64
+ 65# SINGLE VIDEO INFERENCE
+ 66response = client.chat.completions.create(
+ 67    model="Qwen2.5-VL-3B-Instruct",
+ 68    messages=[{
+ 69        "role": "system",
+ 70        "content": "you are a helpful assistant"
+ 71    }, {
+ 72        "role":
+ 73        "user",
+ 74        "content": [{
+ 75            "type": "text",
+ 76            "text": "Tell me what you see in the video briefly."
+ 77        }, {
+ 78            "type": "video_url",
+ 79            "video_url": {
+ 80                "url":
+ 81                "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4"
+ 82            }
+ 83        }]
+ 84    }],
+ 85    max_tokens=64,
+ 86)
+ 87print(response)
+ 88
+ 89# IMAGE EMBED INFERENCE
+ 90image64 = encode_base64_content_from_url(
+ 91    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
+ 92)
+ 93response = client.chat.completions.create(
+ 94    model="Qwen2.5-VL-3B-Instruct",
+ 95    messages=[{
+ 96        "role": "system",
+ 97        "content": "you are a helpful assistant"
+ 98    }, {
+ 99        "role":
+100        "user",
+101        "content": [{
+102            "type": "text",
+103            "text": "Describe the natural environment in the image."
+104        }, {
+105            "type": "image_url",
+106            "image_url": {
+107                "url": "data:image/png;base64," + image64
+108            }
+109        }]
+110    }],
+111    max_tokens=64,
+112)
+113print(response)
 
@@ -760,9 +742,9 @@
diff --git a/latest/examples/openai_completion_client.html b/latest/examples/openai_completion_client.html
index 9a3a0a0cab..3b02a30cb6 100644
--- a/latest/examples/openai_completion_client.html
+++ b/latest/examples/openai_completion_client.html

Installation

LLM API

Examples

Blogs

@@ -549,15 +531,15 @@

previous

-

OpenAI Chat Client

+

OpenAI Chat Client for Multimodal

next

-

Layers

+

Openai Completion Client For Lora

@@ -661,9 +643,9 @@
diff --git a/latest/examples/llm_auto_parallel.html b/latest/examples/openai_completion_client_for_lora.html
similarity index 70%
rename from latest/examples/llm_auto_parallel.html
rename to latest/examples/openai_completion_client_for_lora.html
index 3dfa4b8237..b44e958f71 100644
--- a/latest/examples/llm_auto_parallel.html
+++ b/latest/examples/openai_completion_client_for_lora.html
@@ -9,7 +9,7 @@
- Automatic Parallelism with LLM — TensorRT-LLM
+ Openai Completion Client For Lora — TensorRT-LLM

Installation

LLM API

Examples

  • max_prompt_adapter_token (tensorrt_llm.llmapi.TrtLlmArgs attribute) +
  • max_prompt_embedding_table_size (tensorrt_llm.llmapi.BuildConfig attribute)

    Installation

    LLM API

    Examples

    Blogs

  • Quick Start Guide
@@ -916,9 +905,9 @@
diff --git a/latest/installation/build-from-source-linux.html b/latest/installation/build-from-source-linux.html
index a94761ee90..04e538d05e 100644
--- a/latest/installation/build-from-source-linux.html
+++ b/latest/installation/build-from-source-linux.html

    Installation

    LLM API

    Examples

    Blogs


    @@ -507,17 +490,32 @@
    -
    -

    Installing on Linux#

    +
    +

    Installing on Linux via pip#

    1. Install TensorRT-LLM (tested on Ubuntu 24.04).

      -
      (Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
      +

      Install prerequisites

      +

      Before the pre-built Python wheel can be installed via pip, a few prerequisites must be put into place:

      +
      # Optional step: Only required for Blackwell and Grace Hopper
      +pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
       
      -sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
      +sudo apt-get -y install libopenmpi-dev
      +
      +
      +

      PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell and Grace Hopper GPUs. On prior GPUs, this extra installation is not required.
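      If you are unsure which category your GPU falls into, a quick way to check its compute capability from Python is sketched below (this is only an illustration and assumes PyTorch is already installed):

      import torch

      # Blackwell GPUs report compute capability 10.x; Hopper (including Grace Hopper systems) reports 9.0.
      major, minor = torch.cuda.get_device_capability()
      print(f"Compute capability: {major}.{minor}")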

      +
      +

      Tip

      +

      Instead of manually installing the prerequisites as described above, it is also possible to use the pre-built TensorRT-LLM Develop container image hosted on NGC (see here for information on container tags).

      +
      +

      Install pre-built TensorRT-LLM wheel

      +

      Once all prerequisites are in place, TensorRT-LLM can be installed as follows:

      +
      pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
       
      -

      PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs. On prior GPUs, this extra installation is not required.

      -

      If using the PyTorch NGC Container image, the prerequisite steps for installing NVIDIA Blackwell-enabled PyTorch package and libopenmpi-dev are not required.

    2. Sanity check the installation by running the following in Python (tested on Python 3.12):

       1from tensorrt_llm import SamplingParams
      @@ -576,16 +574,6 @@ Please install CUDA toolkit when you see the following message when running Mode
       

      The installation of CUDA toolkit can be found in CUDA Toolkit Documentation.

    3. -
    4. Install inside the PyTorch NGC Container

      -

      The PyTorch NGC Container may lock Python package versions via the /etc/pip/constraint.txt file. When installing the pre-built TensorRT-LLM wheel inside the PyTorch NGC Container, you need to clear this file first.

      -
      [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt
      -
      -
      -

      PyTorch NGC Container typically includes a pre-installed tensorrt Python package. If there is a version mismatch between this pre-installed package and the version required by the TensorRT-LLM wheel, you will need to uninstall the existing tensorrt package before installing TensorRT-LLM.

      -
      pip uninstall -y tensorrt
      -
      -
      -
@@ -600,12 +588,12 @@ Please install CUDA toolkit when you see the following message when running Mode
diff --git a/latest/key-features.html b/latest/key-features.html
index dbf7668bea..e9605c13e6 100644
--- a/latest/key-features.html
+++ b/latest/key-features.html

    Installation

    LLM API

    Examples

    Blogs


    @@ -511,78 +494,60 @@
    -
    -

    API Introduction#

    -

    The LLM API is a high-level Python API and designed for LLM workflows. -This API is under development and might have breaking changes in the future.

    +
    +

    LLM API Introduction#

    +

    The LLM API is a high-level Python API designed to streamline LLM inference workflows.

    +

    It supports a broad range of use cases, from single-GPU setups to multi-GPU and multi-node deployments, with built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA Dynamo and the Triton Inference Server.

    +

    While the LLM API simplifies inference workflows with a high-level interface, it is also designed with flexibility in mind. Under the hood, it uses a PyTorch-native and modular backend, making it easy to customize, extend, or experiment with the runtime.

    Supported Models#

      +
    • DeepSeek variants

    • Llama (including variants Mistral, Mixtral, InternLM)

    • GPT (including variants Starcoder-1/2, Santacoder)

    • -
    • Gemma-1/2

    • -
    • Phi-1/2/3

    • +
    • Gemma-1/2/3

    • +
    • Phi-1/2/3/4

    • ChatGLM (including variants glm-10b, chatglm, chatglm2, chatglm3, glm4)

    • -
    • QWen-1/1.5/2

    • +
    • QWen-1/1.5/2/3

    • Falcon

    • Baichuan-1/2

    • GPT-J

    • Mamba-1/2

    +
    +

    Note: For the most up-to-date list of supported models, you may refer to the TensorRT-LLM model definitions.

    +
    -
    -

    Model Preparation#

    -

    The LLM class supports input from any of following:

    -
      -
    1. Hugging Face Hub: Triggers a download from the Hugging Face model hub, such as TinyLlama/TinyLlama-1.1B-Chat-v1.0.

    2. -
    3. Local Hugging Face models: Uses a locally stored Hugging Face model.

    4. -
    5. Local TensorRT-LLM engine: Built by trtllm-build tool or saved by the Python LLM API.

    6. -
    -

    Any of these formats can be used interchangeably with the LLM(model=<any-model-path>) constructor.

    -

    The following sections describe how to use these different formats for the LLM API.

    -
    -

    Hugging Face Hub#

    -

    Using the Hugging Face Hub is as simple as specifying the repo name in the LLM constructor:

    +
    +

    Quick Start Example#

    +

    A simple inference example with TinyLlama using the LLM API:

    +
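    A minimal sketch of such an example, assuming the standard LLM API entry points used throughout these pages:

    from tensorrt_llm import LLM, SamplingParams

    # The model is downloaded from the Hugging Face Hub on first use.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    for output in llm.generate(prompts, sampling_params):
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")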

    More examples can be found here.

    +
    +
    +

    Model Input#

    +

    The LLM() constructor accepts either a Hugging Face model ID or a local model path as input.

    +
    +

    1. Using a Model from the Hugging Face Hub#

    +

    To load a model directly from the Hugging Face Model Hub, simply pass its model ID (i.e., repository name) to the LLM constructor. The model will be automatically downloaded:

    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     
    -

    You can also directly load TensorRT Model Optimizer’s quantized checkpoints on Hugging Face Hub in the same way.

    +

    You can also use quantized checkpoints (FP4, FP8, etc.) of popular models provided by NVIDIA in the same way.
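    For instance, a sketch of loading such a checkpoint (the repository name below is only an illustrative placeholder; substitute an actual NVIDIA pre-quantized checkpoint from the Hugging Face Hub):

    from tensorrt_llm import LLM

    # Hypothetical repo id of a pre-quantized FP8 checkpoint; replace with a real one.
    llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")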

    -
    -

    Local Hugging Face Models#

    -

    Given the popularity of the Hugging Face model hub, the API supports the Hugging Face format as one of the starting points. -To use the API with Llama 3.1 models, download the model from the Meta Llama 3.1 8B model page by using the following command:

    +
    +

    2. Using a Local Hugging Face Model#

    +

    To use a model from local storage, first download it manually:

    git lfs install
     git clone https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
     
    -

    After the model download is complete, you can load the model:

    -
    llm = LLM(model=<path_to_meta_llama_from_hf>)
    +

    Then, load the model by specifying a local directory path:

    +
    llm = LLM(model=<local_path_to_model>)
     
    -

    Using this model is subject to a particular license. Agree to the terms and authenticate with Hugging Face to begin the download.

    -
    -
    -

    Local TensorRT-LLM Engine#

    -

    There are two ways to build a TensorRT-LLM engine:

    -
      -
    1. You can build the TensorRT-LLM engine from the Hugging Face model directly with the trtllm-build tool and then save the engine to disk for later use. -Refer to the README in the examples/models/core/llama repository on GitHub.

      -

      After the engine building is finished, we can load the model:

      -
      llm = LLM(model=<path_to_trt_engine>)
      -
      -
      -
    2. -
    3. Use an LLM instance to create the engine and persist to local disk:

      -
      llm = LLM(<model-path>)
      -
      -# Save engine to local disk
      -llm.save(<engine-dir>)
      -
      -
      -

      The engine can be loaded using the model argument as shown in the first approach.

      -
    4. -
    +
    +

    Note: Some models require accepting specific license agreements. Make sure you have agreed to the terms and authenticated with Hugging Face before downloading.

    +
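    As a sketch of the local-path flow above, the manual download step can also be done with huggingface_hub.snapshot_download (the same helper used in the multi-LoRA example), assuming the huggingface_hub package is installed:

    from huggingface_hub import snapshot_download
    from tensorrt_llm import LLM

    # Download to a local directory (equivalent to the git lfs clone shown earlier),
    # then point the LLM constructor at that local path.
    local_dir = snapshot_download(repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    llm = LLM(model=local_dir)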
    @@ -623,12 +588,12 @@ Refer to the

    previous

    -

    Installing on Grace Hopper

    +

    Building from Source Code on Linux

  • +
  • CudaGraphConfig +
  • LookaheadDecodingConfig
