From 1d509f9205782f9683dde1154bdd70e6ad7cd7c1 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 7 Aug 2025 06:26:14 +0000 Subject: [PATCH] Update GitHub pages in root to v1.0.0rc6 --- .buildinfo | 2 +- _cpp_gen/executor.html | 5368 ++--- _cpp_gen/runtime.html | 17837 +++++++++------- .../attention.py | 249 +- .../b6815cf245cc7dc7a26a6f727fdc2dc4/model.py | 39 +- .../model_engine.py | 50 +- .../llm_args.py | 57 +- _modules/index.html | 14 +- _modules/tensorrt_llm/builder.html | 14 +- .../tensorrt_llm/disaggregated_params.html | 14 +- _modules/tensorrt_llm/executor/result.html | 14 +- _modules/tensorrt_llm/executor/utils.html | 14 +- _modules/tensorrt_llm/functional.html | 14 +- _modules/tensorrt_llm/layers/activation.html | 14 +- _modules/tensorrt_llm/layers/attention.html | 14 +- _modules/tensorrt_llm/layers/cast.html | 14 +- _modules/tensorrt_llm/layers/conv.html | 14 +- _modules/tensorrt_llm/layers/embedding.html | 14 +- _modules/tensorrt_llm/layers/linear.html | 14 +- _modules/tensorrt_llm/layers/mlp.html | 14 +- .../tensorrt_llm/layers/normalization.html | 14 +- _modules/tensorrt_llm/layers/pooling.html | 14 +- _modules/tensorrt_llm/llmapi/build_cache.html | 14 +- _modules/tensorrt_llm/llmapi/llm.html | 33 +- _modules/tensorrt_llm/llmapi/llm_args.html | 80 +- _modules/tensorrt_llm/llmapi/mpi_session.html | 14 +- .../tensorrt_llm/models/baichuan/model.html | 14 +- _modules/tensorrt_llm/models/bert/model.html | 14 +- _modules/tensorrt_llm/models/bloom/model.html | 14 +- .../tensorrt_llm/models/chatglm/config.html | 14 +- .../tensorrt_llm/models/chatglm/model.html | 14 +- _modules/tensorrt_llm/models/clip/model.html | 14 +- .../tensorrt_llm/models/cogvlm/config.html | 14 +- .../tensorrt_llm/models/cogvlm/model.html | 14 +- .../tensorrt_llm/models/commandr/model.html | 14 +- _modules/tensorrt_llm/models/dbrx/config.html | 14 +- _modules/tensorrt_llm/models/dbrx/model.html | 14 +- .../models/deepseek_v1/model.html | 14 +- .../models/deepseek_v2/model.html | 14 +- _modules/tensorrt_llm/models/dit/model.html | 14 +- _modules/tensorrt_llm/models/eagle/model.html | 14 +- .../tensorrt_llm/models/enc_dec/model.html | 14 +- .../tensorrt_llm/models/falcon/config.html | 14 +- .../tensorrt_llm/models/falcon/model.html | 14 +- .../tensorrt_llm/models/gemma/config.html | 14 +- _modules/tensorrt_llm/models/gemma/model.html | 14 +- _modules/tensorrt_llm/models/gpt/config.html | 14 +- _modules/tensorrt_llm/models/gpt/model.html | 14 +- _modules/tensorrt_llm/models/gptj/config.html | 14 +- _modules/tensorrt_llm/models/gptj/model.html | 14 +- .../tensorrt_llm/models/gptneox/model.html | 14 +- .../tensorrt_llm/models/llama/config.html | 14 +- _modules/tensorrt_llm/models/llama/model.html | 14 +- _modules/tensorrt_llm/models/mamba/model.html | 14 +- .../tensorrt_llm/models/medusa/config.html | 14 +- .../tensorrt_llm/models/medusa/model.html | 14 +- .../tensorrt_llm/models/mllama/model.html | 14 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 14 +- .../tensorrt_llm/models/modeling_utils.html | 14 +- _modules/tensorrt_llm/models/mpt/model.html | 14 +- .../models/multimodal_encoders/config.html | 14 +- .../models/multimodal_encoders/model.html | 14 +- _modules/tensorrt_llm/models/opt/model.html | 14 +- _modules/tensorrt_llm/models/phi/model.html | 14 +- _modules/tensorrt_llm/models/phi3/model.html | 14 +- .../models/recurrentgemma/model.html | 14 +- .../tensorrt_llm/models/redrafter/model.html | 14 +- _modules/tensorrt_llm/plugin/plugin.html | 14 +- 
_modules/tensorrt_llm/quantization/mode.html | 14 +- .../quantization/quantize_by_modelopt.html | 14 +- .../runtime/enc_dec_model_runner.html | 14 +- _modules/tensorrt_llm/runtime/generation.html | 14 +- .../runtime/kv_cache_manager.html | 14 +- .../tensorrt_llm/runtime/model_runner.html | 14 +- .../runtime/model_runner_cpp.html | 14 +- .../runtime/multimodal_model_runner.html | 14 +- _modules/tensorrt_llm/runtime/session.html | 14 +- _modules/tensorrt_llm/sampling_params.html | 71 +- _sources/_cpp_gen/executor.rst.txt | 42 +- _sources/_cpp_gen/runtime.rst.txt | 314 +- ..._Expert_Parallelism_in_TensorRT-LLM.md.txt | 5 +- ...rmance_Analysis_And_Auto_Enablement.md.txt | 186 + ...t_Parallelism_in_TensorRT-LLM_part2.md.txt | 322 + .../blog9_Deploying_GPT_OSS_on_TRTLLM.md.txt | 362 + _sources/commands/trtllm-serve/index.rst.txt | 9 + .../run-benchmark-with-trtllm-serve.md.txt | 222 + .../trtllm-serve/trtllm-serve.rst.txt | 263 + ...enai_completion_client_json_schema.rst.txt | 2 +- _sources/index.rst.txt | 4 +- _sources/installation/linux.md.txt | 20 +- _sources/llm-api/reference.rst.txt | 12 +- _sources/performance/perf-analysis.md.txt | 4 +- _sources/performance/perf-overview.md.txt | 187 +- _sources/quick-start-guide.md.txt | 97 +- _sources/reference/support-matrix.md.txt | 2 +- _sources/release-notes.md.txt | 1 + .../feature_combination_matrix.md.txt | 6 +- advanced/disaggregated-service.html | 14 +- advanced/executor.html | 14 +- advanced/expert-parallelism.html | 14 +- advanced/gpt-attention.html | 14 +- advanced/gpt-runtime.html | 14 +- advanced/graph-rewriting.html | 14 +- advanced/kv-cache-management.html | 14 +- advanced/kv-cache-reuse.html | 14 +- advanced/lora.html | 14 +- advanced/lowprecision-pcie-allreduce.html | 14 +- advanced/open-sourced-cutlass-kernels.html | 14 +- advanced/speculative-decoding.html | 14 +- advanced/weight-streaming.html | 14 +- architecture/add-model.html | 14 +- architecture/checkpoint.html | 14 +- architecture/core-concepts.html | 14 +- architecture/model-weights-loader.html | 14 +- architecture/overview.html | 20 +- architecture/workflow.html | 14 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 14 +- blogs/Falcon180B-H200.html | 14 +- blogs/H100vsA100.html | 14 +- blogs/H200launch.html | 14 +- blogs/XQA-kernel.html | 14 +- blogs/quantization-in-TRT-LLM.html | 14 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 14 +- ...1_MTP_Implementation_and_Optimization.html | 14 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 14 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 22 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 14 +- .../blog6_Llama4_maverick_eagle_guide.html | 14 +- ...formance_Analysis_And_Auto_Enablement.html | 930 + ...ert_Parallelism_in_TensorRT-LLM_part2.html | 975 + .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 996 + commands/trtllm-bench.html | 18 +- commands/trtllm-build.html | 18 +- commands/trtllm-serve/index.html | 656 + .../run-benchmark-with-trtllm-serve.html | 904 + commands/trtllm-serve/trtllm-serve.html | 1127 + dev-on-cloud/build-image-to-dockerhub.html | 14 +- dev-on-cloud/dev-on-runpod.html | 14 +- examples/curl_chat_client.html | 14 +- examples/curl_chat_client_for_multimodal.html | 14 +- examples/curl_completion_client.html | 14 +- examples/customization.html | 14 +- examples/deepseek_r1_reasoning_parser.html | 14 +- examples/genai_perf_client.html | 14 +- .../genai_perf_client_for_multimodal.html | 14 +- examples/index.html | 14 +- examples/llm_api_examples.html | 14 +- examples/llm_guided_decoding.html | 14 +- 
examples/llm_inference.html | 14 +- examples/llm_inference_async.html | 14 +- examples/llm_inference_async_streaming.html | 14 +- examples/llm_inference_distributed.html | 14 +- examples/llm_logits_processor.html | 14 +- examples/llm_mgmn_llm_distributed.html | 14 +- examples/llm_mgmn_trtllm_bench.html | 14 +- examples/llm_mgmn_trtllm_serve.html | 14 +- examples/llm_multilora.html | 14 +- examples/llm_runtime.html | 14 +- examples/llm_sampling.html | 14 +- examples/llm_speculative_decoding.html | 14 +- examples/openai_chat_client.html | 14 +- .../openai_chat_client_for_multimodal.html | 14 +- examples/openai_completion_client.html | 14 +- .../openai_completion_client_for_lora.html | 14 +- .../openai_completion_client_json_schema.html | 102 +- examples/trtllm_serve_examples.html | 14 +- genindex.html | 581 +- index.html | 50 +- installation/build-from-source-linux.html | 14 +- installation/containers.html | 16 +- installation/linux.html | 31 +- key-features.html | 14 +- llm-api/index.html | 14 +- llm-api/reference.html | 720 +- objects.inv | Bin 155783 -> 163168 bytes overview.html | 14 +- performance/perf-analysis.html | 18 +- performance/perf-benchmarking.html | 14 +- performance/perf-overview.html | 476 +- .../benchmarking-default-performance.html | 14 +- .../deciding-model-sharding-strategy.html | 14 +- .../fp8-quantization.html | 14 +- .../performance-tuning-guide/index.html | 14 +- ...ing-max-batch-size-and-max-num-tokens.html | 14 +- .../useful-build-time-flags.html | 14 +- .../useful-runtime-flags.html | 14 +- py-modindex.html | 14 +- python-api/tensorrt_llm.functional.html | 14 +- python-api/tensorrt_llm.layers.html | 14 +- python-api/tensorrt_llm.models.html | 14 +- python-api/tensorrt_llm.plugin.html | 14 +- python-api/tensorrt_llm.quantization.html | 14 +- python-api/tensorrt_llm.runtime.html | 14 +- quick-start-guide.html | 142 +- reference/ci-overview.html | 14 +- reference/dev-containers.html | 14 +- reference/memory.html | 14 +- reference/precision.html | 14 +- reference/support-matrix.html | 16 +- reference/troubleshooting.html | 14 +- release-notes.html | 15 +- search.html | 14 +- searchindex.js | 2 +- torch.html | 14 +- torch/adding_new_model.html | 14 +- torch/arch_overview.html | 14 +- torch/attention.html | 14 +- .../features/feature_combination_matrix.html | 20 +- torch/features/overlap_scheduler.html | 14 +- torch/features/quantization.html | 14 +- torch/features/sampling.html | 14 +- torch/kv_cache_manager.html | 14 +- torch/scheduler.html | 14 +- 213 files changed, 22996 insertions(+), 12945 deletions(-) create mode 100644 _sources/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md.txt create mode 100644 _sources/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md.txt create mode 100644 _sources/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md.txt create mode 100644 _sources/commands/trtllm-serve/index.rst.txt create mode 100644 _sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt create mode 100644 _sources/commands/trtllm-serve/trtllm-serve.rst.txt create mode 100644 blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html create mode 100644 blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html create mode 100644 blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html create mode 100644 commands/trtllm-serve/index.html create mode 100644 commands/trtllm-serve/run-benchmark-with-trtllm-serve.html create mode 100644 commands/trtllm-serve/trtllm-serve.html diff 
--git a/.buildinfo b/.buildinfo index 255f2b4d24..57ce40362c 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 04d569d8861c27285138a24e2af3e496 +config: 4c4e434803756ce4857c43609ad607a5 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_cpp_gen/executor.html b/_cpp_gen/executor.html index 6670d5d2e8..dca5128f63 100644 --- a/_cpp_gen/executor.html +++ b/_cpp_gen/executor.html @@ -59,7 +59,7 @@ @@ -71,7 +71,7 @@ - + @@ -391,7 +391,11 @@
Architecture
Public Functions
+Constructs a DisaggExecutorOrchestrator object.
+ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive responses for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive responses for each generation executor.
Enqueue context-only requests to context executors.
+requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue the requests in the same context executor. If false, try to use a different executor for each request.
A vector of global request ids corresponding to the order of the requests in requests; the ids returned may differ from the request ids within each executor.
Enqueue generation-only requests to generation executors.
+requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids corresponding to the order of the requests; these must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue the requests in the same generation executor. If false, try to use a different executor for each request.
Await context responses.
+timeout – The maximum time to wait for new responses.
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses from all context executors. If hasContextAwaitThreads is true, this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Await generation responses.
+timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses from all generation executors. If hasGenAwaitThreads is true, this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
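Taken together, the calls above describe a two-phase flow: context-only requests produce global ids that key the matching generation-only requests. A minimal sketch, assuming the signatures implied by the parameter lists above (executor configuration elided):

// Context phase: returns global ids used to pair responses across phases.
auto globalIds = orchestrator.enqueueContext(ctxRequests, /*selectContextId=*/std::nullopt, /*batch=*/false);
auto ctxResponses = orchestrator.awaitContextResponses(std::chrono::milliseconds(100), /*contextIdx=*/std::nullopt);
// Generation phase: must reuse the ids returned by enqueueContext.
orchestrator.enqueueGeneration(genRequests, globalIds, /*selectGenIdx=*/std::nullopt, /*batch=*/false);
auto genResponses = orchestrator.awaitGenerationResponses(std::chrono::milliseconds(100), /*genIdx=*/std::nullopt);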
+Indicates if the current process is allowed to enqueueRequests.
+Get context executors.
+Get generation executors.
+Private Members
+Public Functions
+Public Types
+Public Types
+Public Functions
+Returns a pointer to underlying array.
+Returns a pointer to underlying array.
+Returns the memory type of the buffer.
+Returns the number of elements in the tensor.
+Returns the size of the tensor in bytes.
+Set the entire memory to zero.
+stream – Must be a valid CUDA stream if the memory type is GPU.
+Copy the data and shape from another tensor.
+other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
+Allocate a cpu tensor with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
+shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
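A minimal usage sketch of these allocators; the argument order follows the parameter lists above and should be treated as assumed rather than authoritative.

using tensorrt_llm::executor::Tensor;
// 'dataType' and 'stream' are assumed to be in scope (e.g. a DataType value and a CUDA stream).
auto host    = Tensor::cpu({4, 1024}, dataType);           // pageable host memory
auto pinned  = Tensor::pinned({4, 1024}, dataType);        // pinned host memory
auto pooled  = Tensor::pooledPinned({4, 1024}, dataType);  // pooled pinned host memory
auto unified = Tensor::managed({4, 1024}, dataType);       // managed (UVM) memory
auto device  = Tensor::gpu({4, 1024}, stream, dataType);   // allocated on the given CUDA stream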
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
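A sketch of the non-owning wrappers described above, assuming the executor API's Tensor::of spelling for the container overload:

std::vector<int32_t> tokenIds = {1, 2, 3, 4};
// Views the vector's storage without copying; the Tensor does not take ownership,
// so tokenIds must outlive 'wrapped'.
auto wrapped = tensorrt_llm::executor::Tensor::of(tokenIds);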
Private Functions
+Typedefs
+Public Static Functions
+Public Functions
-Public Functions
-Public Functions
-Constructs a DisaggExecutorOrchestrator object.
-ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
-requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
-requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
-timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
-Await for generation responses.
-timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
-Indicates if the current process is allowed to enqueueRequests.
-Get context executors.
-Get generation executors.
-Private Members
-Public Functions
-Public Types
-Public Types
-Public Functions
-Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns the memory type of the buffer.
-Returns the number of elements in the tensor.
-Returns the size of the tensor in bytes.
-Set the entire memory to zero.
-stream – Must be a valid CUDA stream if the memory type is GPU.
-Copy the data and shape from another tensor.
-other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
-Allocate a cpu tensor with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
-shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
-shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Private Functions
-Public Static Functions
+Public Functions
Public Functions
+Public Types
-Public Functions
-Public Members
-Mandatory parameters.
-GPT decoder class with support for in-flight batching.
-Subclassed by tensorrt_llm::runtime::GptDecoderBatched
-Public Types
-Public Functions
-Setup the decoder before calling forward()
Disable Lookahead decoding.
-Run one step for all requests without blocking the host process and return the token for synchronization.
-Run one step for all requests and wait for completion on the host.
-Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
Public Functions
Public Members
[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-Mandatory parameters Logits
+Mandatory parameters
+Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
Maximum number of decoding tokens of active slots.
+The tokens computed during the gatherTree step, [BS, BM, MSL]. Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize].
+New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
A Vector of views on newTokensSteps for each token [BS, BM].
+Optional parameters
+FinishedState set by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
+Mandatory parameters for Beam Search
+Log-probability of generated tokens, [BS, BM, MSL], float
+Public Static Attributes
+Public Functions
+Public Functions
-Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
-flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Pass an existing cuda stream to this object.
-stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Construct with an existing cuda stream or the default stream by passing nullptr.
-Returns the device on which the stream was created.
-Returns the stream associated with this object.
-Synchronizes the stream.
-Private Types
+Public Types
Public Types
+Public Functions
+A helper class for managing memory on host and device.
+Public Types
+Public Functions
+Construct a BufferManager.
+cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+Destructor.
+Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Allocates an ITensor of the given dimensions and memory type.
Create an empty IBuffer of the given memory type. It may be resized later.
Create an empty ITensor of the given memory type. It may be reshaped later.
Set the contents of the given buffer to value.
Copy src to dst.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
+The current size of the memory reserved by the memory pool.
+The current size of the memory used by the memory pool.
+The current size of the memory free in the memory pool.
+Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
+Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
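A minimal sketch of the allocation and copy flow described above, assuming the tensorrt_llm::runtime spellings (BufferManager, CudaStream, ITensor, MemoryType):

using namespace tensorrt_llm::runtime;
BufferManager manager{std::make_shared<CudaStream>()};
// Asynchronous device allocation on the manager's stream (the cudaMallocAsync path above).
auto logits = manager.gpu(ITensor::makeShape({8, 32000}), nvinfer1::DataType::kFLOAT);
manager.setZero(*logits);                                     // fill with zeros
auto hostCopy = manager.copyFrom(*logits, MemoryType::kCPU);  // device -> host copy
manager.getStream().synchronize();  // wait before reading hostCopy on the host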
Private Members
+Friends
+Public Functions
+Public Static Functions
+Private Members
+Public Functions
+Public Members
+Public Types
+Public Functions
+Private Members
+Functions
+Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
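For reference, the layout this helper produces is just the identity mapping over the batch; a sketch of the equivalent result (not the helper's actual implementation; requires <numeric>):

std::vector<tensorrt_llm::runtime::SizeType32> batchSlots(batchSize);
std::iota(batchSlots.begin(), batchSlots.end(), 0);  // [0, 1, ..., batchSize - 1]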
+Public Types
+Public Functions
+explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
+Private Members
+Subclassed by tensorrt_llm::runtime::GptDecoder< T >
+Public Types
+Public Functions
+explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
+Public Static Functions
+Public Types
+Public Functions
+Public Members
+Private Functions
+Private Members
+Public Members
+Public Functions
+Public Members
+[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
+[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+Public Functions
+Functions
Public Members
-Public Types
-Values:
+Public Functions
Setup buffers for the decoder excluding speculative decoding.
-Private Types
+Private Members
+Public Functions
+Setup buffers for the cache indirection.
-This is used for beam search on pipeline parallel ranks without a decoder.
-Setup buffers for speculative decoding.
-Disable lookahead decoding.
-[batchSize], number of finished sequences per request, on gpu
-[batchSize, beamWidth], FinishedState value, on gpu
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-batchIdx – index of the batch
-[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
-batchIdx – index of the batch
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-[batchSize, maxBeamWidth], sequence lengths, on gpu
-batchIdx – index of the batch
-[maxBeamWidth], sequence lengths for request batchIdx, on gpu
Get maxTokensPerStep tokens generated in the last forward pass.
-[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-[batchSize], predicted draft tokens lengths for previous step, on gpu
-[batchSize], predicted draft tokens lengths for next step, on gpu
-[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-[maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
-Get the number of tokens for all requests in the batch.
-The number of tokens for all requests in the batch.
-Get the number of tokens for a specific request in the batch.
-batchIdx – The index of the request in the batch.
-The number of tokens for the specified request.
-Set the number of tokens for a specific request in the batch.
+CudaVirtualMemoryAllocator::Configuration
batchIdx – The index of the request in the batch.
numTokens – The number of tokens for the specified request.
manager – Manager used to track and manage virtual memories
tag – The tag for allocated memories
mode – Backed storage mode
backStream – The CUDA stream used for restoring memory content. Note: Virtual Address Allocation is not async. The stream is not used in allocation.
Get the speculative decoding mode.
-Get the explicit draft tokens buffers.
-Get the eagle buffers.
-Get the lookahead buffers.
-Workspace for beam search in streaming mode.
-Get the generation steps for all requests in the batch.
-The generation steps for all requests in the batch.
-Set the generation steps for all requests in the batch.
-generationSteps – The generation steps for all requests in the batch.
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-Private Functions
-Private Members
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-[maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token of maxTokensPerStep, on gpu
-Workspace for beam search in streaming mode.
-[batchSize], the num tokens of each request.
-Public Functions
-Public Types
- - -Public Functions
-Public Members
-Public Functions
-Private Members
-Public Functions
-Public Members
-Public Functions
-Public Members
-Mandatory parameters Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
-The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
-A Vector of views on newTokensSteps for each token [BS, BM].
-Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
-Mandatory parameters for Beam Search log-probility of generated tokens, [BS, BM, MSL], float
-Public Static Attributes
Private Functions
+Private Members
+Friends
+CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, providing the ability to release and rematerialize the allocation.
+Public Types
+Values:
+Public Functions
+Materialize this CUDAVirtualMemoryChunk. Shall be called only when status() == RELEASED.
+Calls creator.create(), and then configurator.setup() for each configurator in order.
+Stops at the first thrown exception and propagates it.
+Release this CUDAVirtualMemoryChunk. Shall be called only when status() == MATERIALIZED, or when materialize() throws. Will be called automatically by the destructor if necessary.
+Calls configurator.teardown(), in reverse order, for each configurator whose setup() succeeded in materialize(), and then creator.release().
+Never stops early upon exception. The last thrown exception will be propagated, and others logged.
+Test if this CUDAVirtualMemoryChunk is managing a memory block.
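A sketch of the lifecycle contract these functions describe; only the RELEASED/MATERIALIZED states are named above, so the enum scoping below is assumed:

// Rematerialize a previously released chunk, use it, then release it again.
if (chunk.status() == CUDAVirtualMemoryChunk::RELEASED)  // assumed enum spelling
{
    chunk.materialize();  // creator.create(), then each configurator.setup() in order
}
// ... use the backing memory ...
chunk.release();          // teardown() in reverse order, then creator.release()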
+Private Functions
+Private Members
+Private Static Attributes
+CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle:
Map into virtual address
Bind to multicast object
Backup and restore memory content
Subclassed by tensorrt_llm::runtime::MemsetConfigurator, tensorrt_llm::runtime::MulticastConfigurator, tensorrt_llm::runtime::OffloadConfigurator, tensorrt_llm::runtime::UnicastConfigurator
+Public Functions
+CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, either by creating one locally, or importing one from remote.
+Subclassed by tensorrt_llm::runtime::LocalCreator< count >
+Public Functions
Add memory to be managed by this manager.
+The memory and internal state will remain valid if any exception is thrown.
+handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
memory – The CUDAVirtualMemory object.
Creates and adds memory to be managed by this manager. The created memory is automatically materialized.
+The internal state will remain valid if any exception is thrown.
+handle – Unique handle provided to reference this memory in remove.
tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.
creator – The creator for the memory.
configurators – The configurators for the memory.
Remove the memory from the manager.
+handle – The handle provided to add.
The CUDAVirtualMemory object. If the handle is unknown, an empty CUDAVirtualMemory will be returned.
+Call release for CUDAVirtualMemoryChunk objects with a given tag.
+This function will always call CUDAVirtualMemoryChunk::release on all selected objects. The last exception thrown by CUDAVirtualMemoryChunk::release will be rethrown, and others will be logged.
+If any CUDAVirtualMemoryChunk threw an exception during release, it will be removed from the manager. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
tag – the tag to select target memories.
+Number of objects selected.
+Call materialize for CUDAVirtualMemoryChunk objects with a given tag.
+This function will stop at the first CUDAVirtualMemoryChunk::materialize that throws an exception, and attempt to roll back previously successful materialize calls by calling release. The exception thrown by CUDAVirtualMemoryChunk::materialize will be rethrown, and any exception thrown by release will be logged.
+If any CUDAVirtualMemoryChunk threw an exception during materialize or release, it will be removed from the manager. Successfully rolled-back CUDAVirtualMemoryChunk objects will not be removed. Call retrieveBadHandles to retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to an exception.
tag – the tag to select target memories.
+Number of objects selected.
+Retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to an exception, and reset the list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if an OOM happened. This method is for diagnostic purposes only, and should not be called concurrently with other methods.
+The handle list.
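A sketch of the tag-based flow these functions enable, e.g. releasing and later restoring a group of allocations; 'memoryManager' is a placeholder for the manager documented here, and only the method names are taken from the descriptions above:

auto numReleased = memoryManager.releaseWithTag("kv_cache");      // release all chunks with this tag
// ... memory pressure relieved; later restore the same allocations ...
auto numRestored = memoryManager.materializeWithTag("kv_cache");  // rolls back on the first failure
for (auto const& handle : memoryManager.retrieveBadHandles())
{
    // Handles of chunks removed after a failed release/materialize; diagnostic only.
}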
+Private Types
+Private Functions
+Private Members
+LocalCreator creates memory allocations locally through cuMemCreate.
+Public Functions
+Public Members
+MemsetConfigurator fills the memory with a given value.
+Public Functions
+MulticastConfigurator binds the allocation handle to the given multicast object and offset.
+OffloadConfigurator offloads the content of the allocation to the backup storage on teardown, and restores the content on the following setup.
+Public Functions
+UnicastConfigurator maps the allocation handle into the specified unicast address range.
+Public Functions
+Public Types
+Public Functions
+Public Members
+Private Functions
+Public Members
+Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
+Public Functions
+Public Members
+[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
+Functions
+Utility function to print a shape.
+Utility function to print a tensor with its shape.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T const, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to const T, possibly nullptr.
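A sketch of the null-propagating accessors described here, assuming the bufferCastOrNull spelling used elsewhere in tensorrt_llm::runtime:

tensorrt_llm::runtime::ITensor::SharedPtr maybeLogits{};  // possibly null
// Null in, null out: no need to check the shared_ptr before casting.
float* ptr = tensorrt_llm::runtime::bufferCastOrNull<float>(maybeLogits);
if (ptr != nullptr)
{
    ptr[0] = 0.0F;
}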
+Public Types
+Public Functions
+Returns the tensor's n-th dimension. If n is negative, returns the (nbDims + n)-th dimension. TODO: replace with constexpr parameter when moving to C++20.
+Sets the tensor dimensions. The new size of the tensor will be volume(dims).
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+Removes the given unit dimensions from this tensor.
+Adds a unit dimension at the specified position.
+Public Static Functions
+Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
Returns the volume of the dimensions. Throws if d.nbDims < 0.
Returns the strides of each dimension in a Shape.
+Removes the given unit dimension from shape.
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
A new shape without the unit dimension.
+Add a unit dimension to shape at the specified position.
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
A new shape with the added unit dimension.
+Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
A view on the buffer.
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
Throws – whenever offset overflows or the last dimension offset+size overflows.
+A view of shape [size, <the rest of the dimensions>], or [size] when offsetDims specifies all dimensions.
+Returns the rest of the slices at the last dimension when size is omitted.
+Just the block at that point, with shape [<the rest of the dimensions>], or [1] when offsetDims specifies all dimensions.
+Returns a view on the underlying buffer (or tensor) with the given shape.
tensor – The tensor to view.
shape – The shape of the view.
A view on the tensor.
Returns a view on the underlying tensor which can be independently reshaped.
tensor – The tensor to view.
+A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
A flattened view on the tensor.
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
An ITensor.
A convenience function to create a tensor shape with the given dimensions.
+A convenience function for converting a tensor shape to a string.
A convenience function to compare shapes.
+A convenience function to compare shapes.
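A sketch tying the shape utilities above together, assuming an ITensor::SharedPtr named 'tensor' with shape [4, 8, 16] is in scope:

using tensorrt_llm::runtime::ITensor;
auto dims  = ITensor::makeShape({4, 8, 16});     // convenience shape constructor
auto numel = ITensor::volume(dims);              // 512; the throwing/-1 variants are described above
auto rows  = ITensor::slice(tensor, /*offset=*/0, /*size=*/2);        // [2, 8, 16] view on dimension 0
auto flat  = ITensor::view(tensor, ITensor::makeShape({4 * 8 * 16})); // same storage, shape [512]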
+Protected Functions
+Friends
+Defines
+Typedefs
+Enums
+Public Functions
+Public Members
+Functions
+Configuration for LoraCachePageManager
+See LoraCache docs for description of pages, slots, and page blocks.
+Public Functions
+Private Members
+Public Functions
+Is my rank the last rank in its pipeline?
+Public Static Functions
+Public Static Attributes
+Private Members
+Functions
+Public Types
+Values:
+Public Functions
+Public Static Functions
+Private Members
+Public Types
+Public Functions
+Public Static Functions
+Private Functions
+Private Members
+Private Static Attributes
+Public Types
+Public Functions
+Creates a new cuda event. The event will be destroyed in the destructor.
+flags – Flags for event creation. By default, event timing is disabled.
+Pass an existing cuda event to this object.
+event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
Synchronizes the event.
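A sketch of typical event use with the stream API documented here; CudaStream::record is assumed as the pairing call:

tensorrt_llm::runtime::CudaEvent event;  // event timing disabled by default, per the flags note above
stream.record(event);                    // assumed pairing: capture the stream's current position
event.synchronize();                     // block until all work recorded before the event completes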
+Private Types
+Subclassed by tensorrt_llm::runtime::EagleModule, tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule
+Public Functions
Is my rank the last rank in its pipeline?
+max number of draft tokens that can be accepted by one step of the decoder
+Public Static Functions
-one more than draft path len for prediction from primary head
+max number of tokens that a request can grow in one step of the decoder
+max number of draft tokens processed by one step of the decoder
+one more than decoding draft tokens for prediction from primary head
+max number of tokens processed by one step of the decoder
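These limits follow an off-by-one pattern: each path and each decoding step reserves one extra slot for the prediction from the primary head. A worked example with hypothetical values:

constexpr int maxDraftPathLen = 3;                            // hypothetical
constexpr int maxPathLen = maxDraftPathLen + 1;               // 4: plus the primary-head prediction
constexpr int maxDecodingDraftTokens = 63;                    // hypothetical
constexpr int maxDecodingTokens = maxDecodingDraftTokens + 1; // 64 tokens per decoder step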
+Public Static Attributes
-Private Functions
+Private Members
GPT decoder class with support for in-flight batching.
+Subclassed by tensorrt_llm::runtime::GptDecoderBatched
+Public Types
+Public Functions
+Setup the decoder before calling forward()
Disable Lookahead decoding.
+Run one step for all requests without blocking the host process and return the token for synchronization.
+Run one step for all requests and wait for completion on the host.
+Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
+Public Functions
+Public Members
+[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+Mandatory parameters
+Logits
+Maximum number of decoding tokens of active slots.
+Public Functions
+Private Members
+Public Types
-Public Functions
Public Members
-Private Functions
-Public Members
-Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
-Public Functions
-Public Members
-[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
-A helper class for managing memory on host and device.
-Public Types
-Public Functions
-Construct a BufferManager.
+Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Destructor.
-Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Allocates an ITensor of the given dimensions and memory type.
Create an empty IBuffer of the given memory type. It may be resized later.
Create an empty ITensor of the given memory type. It may be reshaped later.
Set the contents of the given buffer to value.
Copy src to dst.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
-The current size of the memory reserved by the memory pool.
-The current size of the memory used by the memory pool.
-The current size of the memory free in the memory pool.
-Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
-Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
Private Members
-Friends
-Functions
-Public Types
-Values:
-Public Functions
-Public Static Functions
-Private Members
-Public Types
-Public Functions
-Public Members
-Private Functions
-Private Members
-Public Members
-Public Functions
-Public Members
-[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-Public Types
-Public Functions
-Public Static Functions
-Private Functions
-Private Members
-Private Static Attributes
-Public Types
-Public Functions
-Public Types
-Public Functions
-Functions
-Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-Public Types
-Public Functions
-Pass an existing cuda stream to this object.
+explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
+stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Construct with an existing cuda stream or the default stream by passing nullptr.
+Returns the device on which the stream was created.
+Returns the stream associated with this object.
+Synchronizes the stream.
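A sketch of the stream lifecycle these entries describe:

tensorrt_llm::runtime::CudaStream stream;  // creates and owns a new stream on the current device
int device = stream.getDevice();           // device the stream was created on
cudaStream_t raw = stream.get();           // underlying handle for raw CUDA calls
stream.synchronize();                      // block until all queued work completes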
+Private Types
+Private Members
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
-Public Types
-Public Functions
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-Public Static Functions
-Public Functions
Public Static Functions
-Private Members
Defines
+Public Types
-Public Functions
Public Members
Private Types
+Private Functions
+Public Types
+Public Functions
+Public Members
+Mandatory parameters.
+Public Functions
+Public Members
+Public Types
+Public Functions
+Setup buffers for the decoder excluding speculative decoding.
+Setup buffers for the cache indirection.
+This is used for beam search on pipeline parallel ranks without a decoder.
+Setup buffers for speculative decoding.
+Disable lookahead decoding.
+[batchSize], number of finished sequences per request, on gpu
+[batchSize, beamWidth], finished states of type FinishedState, on gpu
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
+batchIdx – index of the batch
+[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
+batchIdx – index of the batch
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
+[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
+[maxBeamWidth], cumulative log probabilities (per beam), on gpu
+[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+[batchSize, maxBeamWidth], sequence lengths, on gpu
+batchIdx – index of the batch
+[maxBeamWidth], sequence lengths for request batchIdx, on gpu
Get maxTokensPerStep tokens generated in the last forward pass.
+[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
+[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
+[batchSize], predicted draft tokens lengths for previous step, on gpu
+[batchSize], predicted draft tokens lengths for next step, on gpu
+[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
+[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
+Get the number of tokens for all requests in the batch.
+The number of tokens for all requests in the batch.
+Get the number of tokens for a specific request in the batch.
+batchIdx – The index of the request in the batch.
+The number of tokens for the specified request.
+Set the number of tokens for a specific request in the batch.
+batchIdx – The index of the request in the batch.
numTokens – The number of tokens for the specified request.
Get the speculative decoding mode.
+Get the explicit draft tokens buffers.
+Get the eagle buffers.
+Get the lookahead buffers.
+Workspace for beam search in streaming mode.
+Get the generation steps for all requests in the batch.
+The generation steps for all requests in the batch.
+Set the generation steps for all requests in the batch.
+generationSteps – The generation steps for all requests in the batch.
+Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+Private Functions
+Private Members
Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+Workspace for beam search in streaming mode.
+[batchSize], the num tokens of each request.
+Typedefs
-Enums
-Functions
-Gets a typed pointer to the constant underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to constant T.
Gets a typed pointer to the underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to const T, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to const T, possibly nullptr.
-A wrapper around nvinfer1::DataType that provides a support for pointer types.
Public Functions
-Public Static Attributes
-Public Types
-For converting a TensorRT data type to a C++ data type.
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Subclassed by tensorrt_llm::runtime::ITensor
-Public Types
-Public Functions
-Returns a pointer to underlying array.
-Returns a pointer to underlying array.
-Returns a pointer to the underlying array at a given element index.
-Returns a pointer to the underlying array at a given element index.
-Returns the size (in number of elements) of the buffer.
-Returns the size (in bytes) of the buffer.
-Returns the capacity of the buffer.
-Returns the memory type of the buffer.
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Releases the buffer. It will be reset to nullptr.
-Public Static Functions
-Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer.
Returns a view on the underlying tensor which can be independently resized.
tensor – The tensor to view.
-A view on the tensor.
Returns a view on the underlying tensor with a different size.
tensor – The tensor to view.
size – The size of the view.
A view on the tensor.
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
An IBuffer.
Determine the memory type of a pointer.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-For converting a C++ data type to a TensorRT data type.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Private Static Attributes
-Public Static Attributes
-Public Functions
Public Static Functions
Private Members
Functions
-Configuration for LoraCachePageManager
-See LoraCache docs for description of pages, slots, and page blocks.
-Public Functions
-Private Members
-