LLM API with TensorRT Engine
A simple inference example with TinyLlama using the LLM API:
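The snippet below is a minimal sketch of that example, in the spirit of the quickstart example that ships with TensorRT-LLM. It assumes `tensorrt_llm` is installed and that the TinyLlama checkpoint can be fetched from the Hugging Face Hub; the prompts and sampling values are illustrative rather than prescriptive.

```python
from tensorrt_llm import LLM, SamplingParams


def main():
    # Instantiating LLM builds (or loads) a TensorRT engine for the model.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # generate() runs batched inference and returns one result per prompt.
    for output in llm.generate(prompts, sampling_params):
        print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")


if __name__ == "__main__":
    main()
```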
For more advanced usage, including distributed inference, multimodal models, and speculative decoding, please refer to this README.
+ @@ -523,8 +527,8 @@Public Functions
-Constructs a DisaggExecutorOrchestrator object.
-ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
-requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
-requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
-timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
-Await for generation responses.
-timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
-Indicates if the current process is allowed to enqueueRequests.
-Get context executors.
-Get generation executors.
-Private Members
-Public Functions
-Public Types
-Public Types
-Public Functions
-Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns the memory type of the buffer.
-Returns the number of elements in the tensor.
-Returns the size of the tensor in bytes.
-Set the entire memory to zero.
-stream – Must be a valid CUDA stream if the memory type is GPU.
-Copy the data and shape from another tensor.
-other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
-Allocate a cpu tensor with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
-shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Functions
-Typedefs
-Public Static Functions
-Public Functions
+Public Functions
+Public Functions
+Constructs a DisaggExecutorOrchestrator object.
+ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
+requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
+requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
+timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
+Await for generation responses.
+timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Indicates if the current process is allowed to enqueueRequests.
+Get context executors.
+Get generation executors.
+Private Members
+Public Functions
+Public Types
+Public Types
+Public Functions
+Returns a pointer to underlying array.
+Returns a pointer to underlying array.
+Returns the memory type of the buffer.
+Returns the number of elements in the tensor.
+Returns the size of the tensor in bytes.
+Set the entire memory to zero.
+stream – Must be a valid CUDA stream if the memory type is GPU.
+Copy the data and shape from another tensor.
+other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
+Allocate a cpu tensor with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
+shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Functions
+Public Static Functions
+Public Functions
-Public Functions
-Functions
+Utility function to print a shape.
+Utility function to print a tensor with its shape.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T const, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to const T, possibly nullptr.
+Public Types
+ +Public Functions
Returns the tensor n-th dimension. If n is negative, returns the (nbDims - n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
+Sets the tensor dimensions. The new size of the tensor will be volume(dims)
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+Removes the given unit dimensions from this tensor.
+Adds a unit dimension at the specified position.
+Public Members
-Public Static Functions
+Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
Returns the volume of the dimensions. Throws if d.nbDims < 0.
Returns the strides of each dimemsion in a Shape.
+Removes the given unit dimension from shape.
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
A new shape without the unit dimension.
+Add a unit dimension to shape at the specified position.
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
A new shape with the added unit dimension.
+Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
A view on the buffer.
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
Whenever – offset overflows or the last dimension offset+size overflows.
+A view of shape [size, the rest dimensions] or [size] when
+return the rest slices at the last dimension when size omitted.
offsetDims – specifies all dimensions.
+Just the block at the point, with shape of [the rest dimensions] or [1] when
+Returns a view on the underlying buffer (or tensor) with the given shape.
tensor – The tensor to view.
shape – The shape of the view.
A view on the tensor.
Returns a view on the underlying tensor which can be independently reshaped.
tensor – The tensor to view.
+A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
A flatten view on the tensor.
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
An ITensor.
A convenience function to create a tensor shape with the given dimensions.
+A convenience function for converting a tensor shape to a string.
A convenience function to compare shapes.
+A convenience function to compare shapes.
+Protected Functions
+Friends
+Public Types
+Public Functions
+Creates a new cuda event. The event will be destroyed in the destructor.
+flags – Flags for event creation. By default, event timing is disabled.
+Pass an existing cuda event to this object.
+event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
Synchronizes the event.
+Private Types
+ + +Functions
+Public Types
+ + +Public Functions
+Private Types
Private Members
+Public Functions
+CudaVirtualMemoryAllocator::Configuration
+manager – Manager used to track and manage virtual memories
tag – The tag for allocated memories
mode – Backed storage mode
backStream – The CUDA stream used for restoring memory content Note: Virtual Address Allocation is not async. The stream is not used in allocation.
Public Static Attributes
+Private Functions
+Private Members
+Friends
+CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, providing the ability to release and rematerialize the allocation.
+Public Types
+Values:
+Public Functions
Materialize this CUDAVirtualMemoryChunk. Shall be called only when status() == RELEASED.
+Calls creator.create(), and then configurator.setup() for each configurator in order.
+Stop at the first thrown exception and propagates it.
+Release this CUDAVirtualMemoryChunk. Shall be called only when status() == MATERIALIZED, or materialize() throws. Will be called automatically by destructor if necessary.
+Calls configurator.teardown() for each configurator that setup() succeed in materialize() in reversed order, and then creator.release().
+Never stops early upon exception. The last thrown exception will be propagated, and others logged.
+Test if this CUDAVirtualMemoryChunk is managing a memory block.
+Private Functions
+Private Members
+Private Static Attributes
+CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle:
Map into virtual address
Bind to multicast object
Backup and restore memory content
Subclassed by tensorrt_llm::runtime::MemsetConfigurator, tensorrt_llm::runtime::MulticastConfigurator, tensorrt_llm::runtime::OffloadConfigurator, tensorrt_llm::runtime::UnicastConfigurator
+Public Functions
+CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, either by creating one locally, or importing one from remote.
+Subclassed by tensorrt_llm::runtime::LocalCreator< count >
+ +Public Functions
+Add memory to be managed by this manager.
+The memory and internal state will remain valid if any exception is thrown.
+ +handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
memory – The CUDAVirtualMemory object.
Creates and adds memory to be managed by this manager. The created memory is automatically materialized.
+The internal state will remain valid if any exception is thrown.
+ +handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
creator – The creator for the memory.
configurators – The configurators for the memory.
Remove the memory from the manager.
+handle – The handle provided to add.
The CUDAVirtualMemory object. If the handle is unknown, an empty CUDAVirtualMemory will be returned.
+Call release for CUDAVirtualMemoryChunk
objects with a given tag.
+This function will always call
+CUDAVirtualMemoryChunk::release on all selected objects. The last exception thrown by CUDAVirtualMemoryChunk::release will be rethrown, and others will be logged.
+If any CUDAVirtualMemoryChunk threw an exception during release, it will be removed from the manager. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
tag – the tag to select target memories.
+Number of objects selected.
+Call materialize for CUDAVirtualMemoryChunk
objects with a given tag.
+This function will stop at the first
+CUDAVirtualMemoryChunk::materialize that throws exception, and attempt to roll back previous successful materialize by calling release. The exception thrown by CUDAVirtualMemoryChunk::materialize will be rethrown, and any exception thrown by release will be logged.
+If any CUDAVirtualMemoryChunk threw an exception during materialize or release, it will be removed from the manager. Successfully roll backed CUDAVirtualMemoryChunk will not be removed. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
tag – the tag to select target memories.
+Number of objects selected.
+Retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception and reset the list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened. This method is only for diagnostic purpose, and should not be called concurrently with other methods.
+The handle list.
+Private Types
+ + +Private Functions
+Private Members
+LocalCreator creates memory allocation locally through cuMemCreate.
+ +MemsetConfigurator fills the memory with given value.
+Public Functions
+MulticastConfigurator binds the allocation handle to the given multicast object and offset.
+ + +OffloadConfigurator offload the content of the allocation to the backup storage when teardown, and restore the content on the following setup.
+Public Functions
+Public Members
+UnicastConfigurator maps the allocation handle into the specified unicast address range.
+Public Functions
+Subclassed by tensorrt_llm::runtime::EagleModule, tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
+Public Functions
+max number of draft tokens that can be accepted by one step of the decoder
++one more than draft path len for prediction from primary head
+max number of tokens that a request can grow in one step of the decoder
+max number of draft tokens processed by one step of the decoder
++one more than decoding draft tokens for prediction from primary head
+max number of tokens processed by one step of the decoder
+Private Functions
+Private Members
+Defines
+Typedefs
+Enums
+ + +Public Functions
+Defines
+Public Functions
+Public Members
+Private Types
+Private Functions
+ + +Typedefs
- - -Enums
- - -Functions
- - - - -Gets a typed pointer to the constant underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to constant T.
Gets a typed pointer to the underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to const T, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to const T, possibly nullptr.
-A wrapper around nvinfer1::DataType that provides a support for pointer types.
Public Functions
-Public Static Attributes
- - -Public Types
-For converting a TensorRT data type to a C++ data type.
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Subclassed by tensorrt_llm::runtime::ITensor
-Public Types
- - -Public Functions
-Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns a pointer to the underlying array at a given element index.
-Returns a pointer to the underlying array at a given element index.
-Returns the size (in number of elements) of the buffer.
-Returns the size (in bytes) of the buffer.
-Returns the capacity of the buffer.
-Returns the memory type of the buffer.
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Releases the buffer. It will be reset to nullptr.
-Public Static Functions
- - -Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer.
Returns a view on the underlying tensor which can be independently resized.
tensor – The tensor to view.
-A view on the tensor.
Returns a view on the underlying tensor with a different size.
tensor – The tensor to view.
size – The size of the view.
A view on the tensor.
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
An IBuffer.
Determine the memory type of a pointer.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-For converting a C++ data type to a TensorRT data type.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Private Static Attributes
-Public Static Attributes
-Public Functions
-Public Members
-Mandatory parameters Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
-The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
-A Vector of views on newTokensSteps for each token [BS, BM].
-Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
-Mandatory parameters for Beam Search log-probility of generated tokens, [BS, BM, MSL], float
-Public Static Attributes
-Public Functions
-Public Types
- - -Public Functions
- - -Public Types
- - -Public Functions
-A helper class for managing memory on host and device.
+#include <iGptDecoderBatched.h> +GPT decoder class with support for in-flight batching.
+Subclassed by tensorrt_llm::runtime::GptDecoderBatched
Public Types
Public Functions
Construct a BufferManager.
+Setup the decoder before calling forward()
Disable Lookahead decoding.
+Run one step for all requests without blocking the host process and return the token for synchronization.
+Run one step for all requests and wait for completion on the host.
+Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
+Public Functions
+Public Members
+[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+Mandatory parameters Logits
+Maximum number of decoding tokens of active slots.
+Public Functions
+Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Destructor.
-Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Pass an existing cuda stream to this object.
+stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Construct with an existing cuda stream or the default stream by passing nullptr.
Allocates an ITensor of the given dimensions and memory type.
Returns the device on which the stream was created.
Create an empty IBuffer of the given memory type. It may be resized later.
Returns the stream associated with this object.
Create an empty ITensor of the given memory type. It may be reshaped later.
Synchronizes the stream.
Set the contents of the given buffer to value.
Record an event on the stream.
Set the contents of the given buffer to zero.
Record an event on the stream.
Copy src to dst.
Wait for an event.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
-The current size of the memory reserved by the memory pool.
-The current size of the memory used by the memory pool.
-The current size of the memory free in the memory pool.
-Public Static Functions
-Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
Public Static Attributes
-Friends
-Public Functions
Public Static Functions
-Private Members
Public Functions
-Public Members
-Public Types
- - -Public Functions
-Private Members
- - -Functions
-Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-Public Types
-Public Functions
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-Private Members
-Subclassed by tensorrt_llm::runtime::GptDecoder< T >
-Public Types
- - -Public Functions
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-Public Static Functions
-Public Types
-Public Functions
-Public Members
- - -Private Functions
-Private Members
-Public Members
- - - - - - - - - - - - - - -Public Functions
-Public Members
- - - - - - - - -[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-Functions
-Public Types
- - -Public Functions
-Private Types
-Private Members
-Public Functions
-CudaVirtualMemoryAllocator::Configuration
-manager – Manager used to track and manage virtual memories
tag – The tag for allocated memories
mode – Backed storage mode
backStream – The CUDA stream used for restoring memory content Note: Virtual Address Allocation is not async. The stream is not used in allocation.
Public Static Attributes
-Private Functions
-Private Members
-Friends
-CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, providing the ability to release and rematerialize the allocation.
-Public Types
-Values:
-Public Functions
- - -Materialize this CUDAVirtualMemoryChunk. Shall be called only when status() == RELEASED.
-Calls creator.create(), and then configurator.setup() for each configurator in order.
-Stop at the first thrown exception and propagates it.
-Release this CUDAVirtualMemoryChunk. Shall be called only when status() == MATERIALIZED, or materialize() throws. Will be called automatically by destructor if necessary.
-Calls configurator.teardown() for each configurator that setup() succeed in materialize() in reversed order, and then creator.release().
-Never stops early upon exception. The last thrown exception will be propagated, and others logged.
-Test if this CUDAVirtualMemoryChunk is managing a memory block.
-Private Functions
-Private Members
-Private Static Attributes
-CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle:
Map into virtual address
Bind to multicast object
Backup and restore memory content
Subclassed by tensorrt_llm::runtime::MemsetConfigurator, tensorrt_llm::runtime::MulticastConfigurator, tensorrt_llm::runtime::OffloadConfigurator, tensorrt_llm::runtime::UnicastConfigurator
-Public Functions
-CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, either by creating one locally, or importing one from remote.
-Subclassed by tensorrt_llm::runtime::LocalCreator< count >
- -Public Functions
-Add memory to be managed by this manager.
-The memory and internal state will remain valid if any exception is thrown.
- -handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
memory – The CUDAVirtualMemory object.
Creates and adds memory to be managed by this manager. The created memory is automatically materialized.
-The internal state will remain valid if any exception is thrown.
- -handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
creator – The creator for the memory.
configurators – The configurators for the memory.
Remove the memory from the manager.
-handle – The handle provided to add.
The CUDAVirtualMemory object. If the handle is unknown, an empty CUDAVirtualMemory will be returned.
-Call release for CUDAVirtualMemoryChunk
objects with a given tag.
-This function will always call
-CUDAVirtualMemoryChunk::release on all selected objects. The last exception thrown by CUDAVirtualMemoryChunk::release will be rethrown, and others will be logged.
-If any CUDAVirtualMemoryChunk threw an exception during release, it will be removed from the manager. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
tag – the tag to select target memories.
-Number of objects selected.
-Call materialize for CUDAVirtualMemoryChunk
objects with a given tag.
-This function will stop at the first
-CUDAVirtualMemoryChunk::materialize that throws exception, and attempt to roll back previous successful materialize by calling release. The exception thrown by CUDAVirtualMemoryChunk::materialize will be rethrown, and any exception thrown by release will be logged.
-If any CUDAVirtualMemoryChunk threw an exception during materialize or release, it will be removed from the manager. Successfully roll backed CUDAVirtualMemoryChunk will not be removed. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
tag – the tag to select target memories.
-Number of objects selected.
-Retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception and reset the list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened. This method is only for diagnostic purpose, and should not be called concurrently with other methods.
-The handle list.
-Private Types
- - -Private Functions
-Private Members
-LocalCreator creates memory allocation locally through cuMemCreate.
- - -MemsetConfigurator fills the memory with given value.
-Public Functions
-MulticastConfigurator binds the allocation handle to the given multicast object and offset.
- - -OffloadConfigurator offload the content of the allocation to the backup storage when teardown, and restore the content on the following setup.
-Public Functions
-UnicastConfigurator maps the allocation handle into the specified unicast address range.
-Public Functions
-Public Types
-Public Functions
-Public Members
-Private Functions
-Public Members
- - - - - - - - - - - - - - - - - - - - - - - - -Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
-Public Functions
-Public Members
- - - - - - - - -[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
-Functions
-Utility function to print a shape.
-Utility function to print a tensor with its shape.
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-tensorPtr – A possibly null shared ptr.
-A pointer to T const, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-tensorPtr – A possibly null shared ptr.
-A pointer to T, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to const T, possibly nullptr.
-Public Types
- - -Public Functions
-Returns the tensor n-th dimension. If n is negative, returns the (nbDims - n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
-Sets the tensor dimensions. The new size of the tensor will be volume(dims)
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Removes the given unit dimensions from this tensor.
-Adds a unit dimension at the specified position.
-Public Static Functions
-Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
Returns the volume of the dimensions. Throws if d.nbDims < 0.
Returns the strides of each dimemsion in a Shape.
-Removes the given unit dimension from shape.
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
A new shape without the unit dimension.
-Add a unit dimension to shape at the specified position.
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
A new shape with the added unit dimension.
-Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
A view on the buffer.
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
Whenever – offset overflows or the last dimension offset+size overflows.
-A view of shape [size, the rest dimensions] or [size] when
-return the rest slices at the last dimension when size omitted.
offsetDims – specifies all dimensions.
-Just the block at the point, with shape of [the rest dimensions] or [1] when
-Returns a view on the underlying buffer (or tensor) with the given shape.
tensor – The tensor to view.
shape – The shape of the view.
A view on the tensor.
Returns a view on the underlying tensor which can be independently reshaped.
tensor – The tensor to view.
-A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
A flatten view on the tensor.
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
An ITensor.
A convenience function to create a tensor shape with the given dimensions.
-A convenience function for converting a tensor shape to a string.
A convenience function to compare shapes.
-A convenience function to compare shapes.
-Protected Functions
-Friends
-Defines
-Typedefs
-Enums
- - -Public Functions
-Functions
-Configuration for LoraCachePageManager
-See LoraCache docs for description of pages, slots, and page blocks.
-Public Functions
-Private Members
-Public Functions
-Is my rank the last rank in its pipeline?
-Public Static Functions
-Public Static Attributes
-Private Members
-Functions
-Public Types
-Values:
-Public Functions
-Public Static Functions
-Private Members
-Public Types
-Public Functions
-Public Static Functions
-Private Functions
-Private Members
-Private Static Attributes
-Public Types
-Public Functions
-Creates a new cuda event. The event will be destroyed in the destructor.
-flags – Flags for event creation. By default, event timing is disabled.
-Pass an existing cuda event to this object.
-event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
Synchronizes the event.
-Private Types
- - -Represents the inputs to the decoder.
-This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
-Public Types
-Public Functions
-Public Members
-Mandatory parameters The index of the decoding step we are on. Only used in Python runtime
-The maximum number of tokens to decode.
-The maximum length of the attention window to consider while decoding.
-The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.
-The number of samples in the batch.
-The beam widths of each request, [batchSize].
-The maximum value in the stopWordsLens tensor.
The maximum value in the badWordsLens tensor.
The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu
-The end ids, [batchSize * beamWidth] on gpu.
-Address map of the linear batch id to to the seq slots, [batchSize] on pinned, int32_t.
-Optional parameters Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu
-The maximum sequence length for each sequence in the batch, [batchSize] on gpu.
-Parameters for beam search KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu
-Steps of each request, for Variable-Beam-Width-Search, [batchSize].
-Public Members
-Public Members
-Public Members
- - - - - - - - - - - - - - - - - - - - -Public Members
-[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
-[batchSize, maxTokensPerStep], on gpu
-[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
-[batchSize], on gpu
-Subclassed by tensorrt_llm::runtime::EagleModule, tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
-Public Functions
-max number of draft tokens that can be accepted by one step of the decoder
--one more than draft path len for prediction from primary head
-max number of tokens that a request can grow in one step of the decoder
-max number of draft tokens processed by one step of the decoder
--one more than decoding draft tokens for prediction from primary head
-max number of tokens processed by one step of the decoder
-Private Functions
-Private Members
-GPT decoder class with support for in-flight batching.
-Subclassed by tensorrt_llm::runtime::GptDecoderBatched
-Public Types
-Public Functions
-Setup the decoder before calling forward()
Disable Lookahead decoding.
-Run one step for all requests without blocking the host process and return the token for synchronization.
-Run one step for all requests and wait for completion on the host.
-Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
-Public Functions
-Public Members
-[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-Mandatory parameters Logits
-Maximum number of decoding tokens of active slots.
-Public Functions
-Private Members
-GPT decoder class with support for in-flight batching.
-Public Types
-Public Functions
-Setup the decoder before calling forward()
Disable Lookahead decoding.
-Run one step for all requests without blocking the host process and return the token for synchronization.
-Run one step for all requests and wait for completion on the host.
-Gather final beam search results for request batchSlot. Result will only be available after event returned.
Private Types
-Private Functions
-Calls decoders for tokens per engine step.
-Private Members
-Public Functions
-Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
-flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Pass an existing cuda stream to this object.
-stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Construct with an existing cuda stream or the default stream by passing nullptr.
-Returns the device on which the stream was created.
-Returns the stream associated with this object.
-Synchronizes the stream.
-Private Types
- - -Functions
-Public Members
-Defines
-Public Functions
-Public Members
-Private Types
-Private Functions
- - -Public Functions
+Public Types
+ + +Public Functions
+Public Members
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Public Functions
+Private Members
+Public Functions
+Public Members
+Public Functions
+Public Members
+Mandatory parameters: previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
+The tokens computed during the gatherTree step, [BS, BM, MSL]. Necessary for “Streaming + Beam Search” mode, since beam search kernels store ungathered tokens in ids.
New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
+A vector of views on newTokensSteps for each token, [BS, BM].
+Optional parameters: FinishedState set by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
+Mandatory parameters for beam search: log-probability of generated tokens, [BS, BM, MSL], float
+Public Static Attributes
+Public Functions
+Represents the inputs to the decoder.
+This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
+Public Types
+Public Functions
+Public Members
+Mandatory parameters: the index of the decoding step we are on. Only used in the Python runtime
+The maximum number of tokens to decode.
+The maximum length of the attention window to consider while decoding.
+The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.
+The number of samples in the batch.
+The beam widths of each request, [batchSize].
+The maximum value in the stopWordsLens tensor.
The maximum value in the badWordsLens tensor.
The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu
+The end ids, [batchSize * beamWidth] on gpu.
+Address map of the linear batch id to the seq slots, [batchSize] on pinned, int32_t.
+Optional parameters: finished states at the current iteration (skip the decoding step of a request if true), [batchSize, beamWidth] on gpu
+The maximum sequence length for each sequence in the batch, [batchSize] on gpu.
+Parameters for beam search: KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu
+Steps of each request, for Variable-Beam-Width-Search, [batchSize].
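As a rough sketch of how the mandatory scalar fields above might be inspected; this assumes DecodingInput exposes public members named step, maxLength, and batchSize in decodingInput.h (construction of a populated instance is omitted):

```cpp
#include <tensorrt_llm/runtime/decodingInput.h>

#include <iostream>

// Hypothetical helper: print the batch-level scalars documented above.
void printDecodingInput(tensorrt_llm::runtime::DecodingInput const& input)
{
    std::cout << "step=" << input.step            // decoding step index (Python runtime only)
              << " maxLength=" << input.maxLength // maximum number of tokens to decode
              << " batchSize=" << input.batchSize // number of samples in the batch
              << '\n';
}
```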
+Public Members
+Public Members
+Public Members
+Public Members
+[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
+[batchSize, maxTokensPerStep], on gpu
+[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
+[batchSize], on gpu
+Public Functions
+Is my rank the last rank in its pipeline?
+Public Static Functions
+Public Static Attributes
+Private Members
+GPT decoder class with support for in-flight batching.
+Public Types
+Public Functions
+Set up the decoder before calling forward().
Disable Lookahead decoding.
+Run one step for all requests without blocking the host process and return the token for synchronization.
+Run one step for all requests and wait for completion on the host.
+Gather final beam search results for request batchSlot. The result only becomes available after the returned event has completed.
Private Types
+Private Functions
+Calls decoders for tokens per engine step.
+Private Members
+Public Types
+Public Functions
+Public Members
+Private Functions
+Public Members
+Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
+Public Functions
+Public Members
+[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
+A helper class for managing memory on host and device.
+Public Types
+Public Functions
+Construct a BufferManager.
+cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+Destructor.
+Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Allocates an ITensor of the given dimensions and memory type.
Create an empty IBuffer of the given memory type. It may be resized later.
Create an empty ITensor of the given memory type. It may be reshaped later.
Set the contents of the given buffer to value.
Copy src to dst.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
+The current size of the memory reserved by the memory pool.
+The current size of the memory used by the memory pool.
+The current size of the memory free in the memory pool.
+Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
+Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
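A minimal sketch of how these allocation helpers compose, assuming the constructor, the gpu/pinned factories, copy, and setZero described above (all from bufferManager.h):

```cpp
#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <memory>

void allocateAndStage()
{
    using namespace tensorrt_llm::runtime;

    // All pool-backed operations are enqueued on this stream.
    BufferManager manager{std::make_shared<CudaStream>()};

    // Pool-backed GPU tensor (cudaMallocAsync), then zero-fill.
    auto logits = manager.gpu(ITensor::makeShape({8, 1, 32000}),
                              nvinfer1::DataType::kFLOAT);
    manager.setZero(*logits);

    // Static helper: pinned host tensor, then async device-to-host copy.
    auto hostCopy = BufferManager::pinned(ITensor::makeShape({8, 1, 32000}),
                                          nvinfer1::DataType::kFLOAT);
    manager.copy(*logits, *hostCopy);

    manager.getStream().synchronize(); // make the copy visible on the host
}
```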
Private Members
+Friends
+Functions
+Public Types
+Values:
+Public Functions
+Public Static Functions
+Private Members
+Public Types
+Public Functions
+Public Members
+Private Functions
+Private Members
+Public Members
+Public Functions
+Public Members
+[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
+[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+Public Types
+Public Functions
+Public Static Functions
+Private Functions
+Private Members
+Private Static Attributes
+Public Types
+Public Functions
+Public Types
+Public Functions
+Functions
+Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
+Public Types
+Public Functions
+explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around (WAR) the lack of a bf16 decoder.
+Private Members
+Subclassed by tensorrt_llm::runtime::GptDecoder< T >
+Public Types
+Public Functions
+explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around (WAR) the lack of a bf16 decoder.
+Public Static Functions
+Public Functions
+Public Static Functions
+Private Members
+Functions
+Public Members
+Public Types
+Public Functions
+Private Members
+Typedefs
+Enums
+Functions
+Gets a typed pointer to the constant underlying data of the buffer.
+T – The type of the underlying data.
+buffer – The buffer to get a pointer to.
+A pointer to constant T.
Gets a typed pointer to the underlying data of the buffer.
+T – The type of the underlying data.
+buffer – The buffer to get a pointer to.
+A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+T – The type of the underlying data.
+bufferPtr – A possibly null shared ptr.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+T – The type of the underlying data.
+bufferPtr – A possibly null shared ptr.
+A pointer to const T, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to const T, possibly nullptr.
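A minimal sketch of these typed-pointer helpers, assuming they live in iBuffer.h under names like bufferCast and bufferCastOrNull (matching the descriptions above):

```cpp
#include <tensorrt_llm/runtime/iBuffer.h>

#include <cstdint>

void inspectTokens(tensorrt_llm::runtime::IBuffer::SharedPtr const& tokenBuffer)
{
    using namespace tensorrt_llm::runtime;

    // Typed pointer to the underlying data (buffer known to be non-null).
    std::int32_t* tokens = bufferCast<std::int32_t>(*tokenBuffer);

    // Null-safe variant: returns nullptr when the shared_ptr is empty.
    std::int32_t* maybeTokens = bufferCastOrNull<std::int32_t>(tokenBuffer);

    (void) tokens;
    (void) maybeTokens;
}
```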
+A wrapper around nvinfer1::DataType that provides support for pointer types.
Public Functions
+Public Static Attributes
+Public Types
+For converting a TensorRT data type to a C++ data type.
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Subclassed by tensorrt_llm::runtime::ITensor
+Public Types
+Public Functions
+Returns a pointer to underlying array.
+Returns a pointer to underlying array.
+Returns a pointer to the underlying array at a given element index.
+Returns a pointer to the underlying array at a given element index.
+Returns the size (in number of elements) of the buffer.
+Returns the size (in bytes) of the buffer.
+Returns the capacity of the buffer.
+Returns the memory type of the buffer.
+Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+Releases the buffer. It will be reset to nullptr.
+Public Static Functions
+Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer.
Returns a view on the underlying tensor which can be independently resized.
tensor – The tensor to view.
+A view on the tensor.
Returns a view on the underlying tensor with a different size.
tensor – The tensor to view.
size – The size of the view.
A view on the tensor.
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
An IBuffer.
Determine the memory type of a pointer.
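A minimal sketch combining these static helpers, assuming the wrap, slice, and memoryType signatures described above:

```cpp
#include <tensorrt_llm/runtime/iBuffer.h>

#include <vector>

void wrapAndSlice()
{
    using namespace tensorrt_llm::runtime;

    std::vector<float> host(16, 0.0f);

    // Non-owning wrapper: cannot be resized beyond its capacity.
    IBuffer::SharedPtr wrapped = IBuffer::wrap(
        host.data(), nvinfer1::DataType::kFLOAT, host.size(), host.size());

    // Sliced view sharing storage with `wrapped`: elements [4, 12).
    auto view = IBuffer::slice(wrapped, 4, 8);

    // Classify a raw pointer (host memory here).
    auto where = IBuffer::memoryType(host.data());

    (void) view;
    (void) where;
}
```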
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+For converting a C++ data type to a TensorRT data type.
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Private Static Attributes
+Public Static Attributes
+Public Functions
Public Static Functions
Private Members
Functions
+Configuration for LoraCachePageManager
+See LoraCache docs for description of pages, slots, and page blocks.
+Public Functions
+Private Members
[... identical navigation-only hunks, each adding the "Use TensorRT Engine" sidebar section, repeat for _modules/index.html and the remaining _modules/tensorrt_llm/**.html pages (builder, disaggregated_params, executor, functional, layers, llmapi, models, plugin, quantization, runtime, sampling_params); elided ...]
diff --git a/_sources/_cpp_gen/executor.rst.txt b/_sources/_cpp_gen/executor.rst.txt
index d3ca9cd473..39b9a6f5a4 100644
--- a/_sources/_cpp_gen/executor.rst.txt
+++ b/_sources/_cpp_gen/executor.rst.txt
[... reordering of the ``doxygenfile`` sections: transferAgent.h, types.h, and cacheCommunicator.h move to the top of the file, serialization.h moves to the bottom ...]

diff --git a/_sources/_cpp_gen/runtime.rst.txt b/_sources/_cpp_gen/runtime.rst.txt
index 536188f7ce..b8dd953966 100644
--- a/_sources/_cpp_gen/runtime.rst.txt
+++ b/_sources/_cpp_gen/runtime.rst.txt
[... long reordering of the ``doxygenfile`` sections for the runtime headers; elided ...]

diff --git a/_sources/examples/llm_runtime.rst.txt b/_sources/examples/llm_runtime.rst.txt
index 163be13f79..54bcd0180a 100644
--- a/_sources/examples/llm_runtime.rst.txt
+++ b/_sources/examples/llm_runtime.rst.txt
@@ -3,6 +3,6 @@ Runtime Configuration Examples
 Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_runtime.py.
 
 .. literalinclude:: ../../../examples/llm-api/llm_runtime.py
-   :lines: 4-97
+   :lines: 4-96
    :language: python
    :linenos:

diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt
index b7b7fe7ef5..38b7662ed5 100644
--- a/_sources/index.rst.txt
+++ b/_sources/index.rst.txt
@@ -160,6 +160,12 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/XQA-kernel.md
    blogs/tech_blog/*
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Use TensorRT Engine
+   :hidden:
+
+   legacy/tensorrt_quickstart.md
 
 Indices and tables
 ==================

diff --git a/_sources/legacy/tensorrt_quickstart.md.txt b/_sources/legacy/tensorrt_quickstart.md.txt
new file mode 100644
index 0000000000..df62aa38d7
--- /dev/null
+++ b/_sources/legacy/tensorrt_quickstart.md.txt
@@ -0,0 +1,9 @@
+# LLM API with TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+
+```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
+    :language: python
+    :linenos:
+```
+
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
diff --git a/_sources/llm-api/reference.rst.txt b/_sources/llm-api/reference.rst.txt
index 9481d4c057..450f4f3741 100644
--- a/_sources/llm-api/reference.rst.txt
+++ b/_sources/llm-api/reference.rst.txt
[... the long :exclude-members: lists on the two autoclass directives (the second is tensorrt_llm.llmapi.TrtLlmArgs) are reordered; the set of excluded pydantic helpers is unchanged ...]
[... the same navigation-only hunks repeat for the advanced/*.html pages (disaggregated-service, executor, expert-parallelism, gpt-attention, gpt-runtime, graph-rewriting, kv-cache-management, kv-cache-reuse, lora, lowprecision-pcie-allreduce, open-sourced-cutlass-kernels, speculative-decoding, weight-streaming) and the architecture/*.html pages (add-model, checkpoint, core-concepts, model-weights-loader); elided ...]
diff --git a/installation/containers.html b/installation/containers.html
index c01c6d0815..947d7dc51b 100644
--- a/installation/containers.html
+++ b/installation/containers.html
@@ -525,7 +529,7 @@
 Container image tags
In the example shell commands, x.y.z corresponds to the TensorRT-LLM container
version to use. If omitted, IMAGE_TAG will default to tensorrt_llm.__version__
-(e.g., this documentation was generated from the 1.1.0rc2 source tree).
+(e.g., this documentation was generated from the 1.1.0rc3 source tree).
If this does not work, e.g., because a container for the version you are
currently working with has not been released yet, you can try using a
container published for a previous
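The default tag therefore tracks the installed package. A minimal sketch for checking which tag that would be, assuming tensorrt_llm is already importable in your environment:

# Print the installed TensorRT-LLM version; when x.y.z is omitted,
# the documented commands fall back to this value as IMAGE_TAG.
import tensorrt_llm

print(tensorrt_llm.__version__)  # e.g. '1.1.0rc3'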
@@ -665,9 +669,9 @@ for all related options.
Use TensorRT Engine
+
@@ -548,39 +552,45 @@ image hosted on NGC
This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
Sanity check the installation by running the following in Python (tested on Python 3.12):
-from tensorrt_llm import LLM, SamplingParams
-
+from tensorrt_llm import BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM  # NOTE the change
 
-def main():
-
-    # Model could accept HF model name, a path to local HF model,
-    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-    # Sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    # Create a sampling params.
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    for output in llm.generate(prompts, sampling_params):
-        print(
-            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-        )
-
-    # Got output like
-    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
-    # Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the'
-    # Prompt: 'The capital of France is', Generated text: 'Paris.'
-    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+def main():
+
+    build_config = BuildConfig()
+    build_config.max_batch_size = 256
+    build_config.max_num_tokens = 1024
+
+    # Model could accept HF model name, a path to local HF model,
+    # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              build_config=build_config)
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create sampling parameters.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    for output in llm.generate(prompts, sampling_params):
+        print(
+            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        )
 
-
-if __name__ == '__main__':
-    main()
+    # Got output like
+    # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
+    # Prompt: 'The capital of France is', Generated text: 'Paris.'
+    # Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'
+
+
+if __name__ == '__main__':
+    main()
@@ -729,9 +739,9 @@ The setup methods depend on your Slurm configuration; please check with your admin
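The updated sanity check above builds the engine with explicit BuildConfig limits before wrapping it in the TensorRT-engine LLM. If you rerun it often, it can help to persist the built engine and reload it later; the following is a minimal sketch, not part of the patch, assuming the LLM API's save() helper and engine-directory loading behave as described in the LLM API reference:

# Build once with explicit limits, save the engine, and reload it later
# (the ./tinyllama_engine path is a hypothetical example).
from tensorrt_llm import BuildConfig
from tensorrt_llm._tensorrt_engine import LLM

build_config = BuildConfig()
build_config.max_batch_size = 256   # largest batch the engine will accept
build_config.max_num_tokens = 1024  # token budget per forward pass

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          build_config=build_config)
llm.save("./tinyllama_engine")      # serialize the built TensorRT engine

# On a later run, load the saved engine instead of rebuilding:
llm = LLM(model="./tinyllama_engine")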
diff --git a/key-features.html b/key-features.html
index 27a15ba906..627d857609 100644
--- a/key-features.html
+++ b/key-features.html
@@ -59,7 +59,7 @@
@@ -71,7 +71,7 @@
-
+
@@ -467,6 +467,10 @@
Use TensorRT Engine
+A simple inference example with TinyLlama using the LLM API:
+For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this README.
+Use TensorRT Engine
+
@@ -735,9 +739,9 @@
diff --git a/llm-api/reference.html b/llm-api/reference.html
index 053d342f31..e2670a3890 100644
--- a/llm-api/reference.html
+++ b/llm-api/reference.html
@@ -59,7 +59,7 @@
@@ -71,7 +71,7 @@
-
+
@@ -471,6 +471,10 @@
+
@@ -20781,9 +20785,9 @@ If rebuilding _was_ required, returns True if rebuilding was succes
diff --git a/objects.inv b/objects.inv
index a7f75c6922460231af09d7be43329d63de800eb1..b134cd55d9d8711d770f6c72a4f747e77a08407d 100644
GIT binary patch
delta 24529
[binary delta payload omitted]