From 1b7ccbecd70e3a69113128a9d0abcbd554b91d2f Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 8 Jul 2025 02:03:18 +0000 Subject: [PATCH] Update latest GitHub pages to v1.0.0rc2 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 6590 +++--- latest/_cpp_gen/runtime.html | 18347 ++++++++-------- .../attention.py | 24 +- .../llm_args.py | 251 +- latest/_modules/index.html | 71 +- latest/_modules/tensorrt_llm/builder.html | 71 +- .../tensorrt_llm/disaggregated_params.html | 71 +- .../tensorrt_llm/executor/result.html | 74 +- .../_modules/tensorrt_llm/executor/utils.html | 71 +- latest/_modules/tensorrt_llm/functional.html | 71 +- .../tensorrt_llm/layers/activation.html | 71 +- .../tensorrt_llm/layers/attention.html | 71 +- latest/_modules/tensorrt_llm/layers/cast.html | 71 +- latest/_modules/tensorrt_llm/layers/conv.html | 71 +- .../tensorrt_llm/layers/embedding.html | 71 +- .../_modules/tensorrt_llm/layers/linear.html | 71 +- latest/_modules/tensorrt_llm/layers/mlp.html | 71 +- .../tensorrt_llm/layers/normalization.html | 71 +- .../_modules/tensorrt_llm/layers/pooling.html | 71 +- .../tensorrt_llm/llmapi/build_cache.html | 71 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 76 +- .../tensorrt_llm/llmapi/llm_args.html | 345 +- .../tensorrt_llm/llmapi/mpi_session.html | 71 +- .../tensorrt_llm/models/baichuan/model.html | 71 +- .../tensorrt_llm/models/bert/model.html | 71 +- .../tensorrt_llm/models/bloom/model.html | 71 +- .../tensorrt_llm/models/chatglm/config.html | 71 +- .../tensorrt_llm/models/chatglm/model.html | 71 +- .../tensorrt_llm/models/clip/model.html | 71 +- .../tensorrt_llm/models/cogvlm/config.html | 71 +- .../tensorrt_llm/models/cogvlm/model.html | 71 +- .../tensorrt_llm/models/commandr/model.html | 71 +- .../tensorrt_llm/models/dbrx/config.html | 71 +- .../tensorrt_llm/models/dbrx/model.html | 71 +- .../models/deepseek_v1/model.html | 71 +- .../models/deepseek_v2/model.html | 71 +- 
.../tensorrt_llm/models/dit/model.html | 71 +- .../tensorrt_llm/models/eagle/model.html | 71 +- .../tensorrt_llm/models/enc_dec/model.html | 71 +- .../tensorrt_llm/models/falcon/config.html | 71 +- .../tensorrt_llm/models/falcon/model.html | 71 +- .../tensorrt_llm/models/gemma/config.html | 71 +- .../tensorrt_llm/models/gemma/model.html | 71 +- .../tensorrt_llm/models/gpt/config.html | 71 +- .../tensorrt_llm/models/gpt/model.html | 71 +- .../tensorrt_llm/models/gptj/config.html | 71 +- .../tensorrt_llm/models/gptj/model.html | 71 +- .../tensorrt_llm/models/gptneox/model.html | 71 +- .../tensorrt_llm/models/llama/config.html | 71 +- .../tensorrt_llm/models/llama/model.html | 71 +- .../tensorrt_llm/models/mamba/model.html | 71 +- .../tensorrt_llm/models/medusa/config.html | 71 +- .../tensorrt_llm/models/medusa/model.html | 71 +- .../tensorrt_llm/models/mllama/model.html | 71 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 71 +- .../tensorrt_llm/models/modeling_utils.html | 73 +- .../tensorrt_llm/models/mpt/model.html | 71 +- .../models/multimodal_encoders/config.html | 71 +- .../models/multimodal_encoders/model.html | 71 +- .../tensorrt_llm/models/opt/model.html | 71 +- .../tensorrt_llm/models/phi/model.html | 71 +- .../tensorrt_llm/models/phi3/model.html | 71 +- .../models/recurrentgemma/model.html | 71 +- .../tensorrt_llm/models/redrafter/model.html | 71 +- .../_modules/tensorrt_llm/plugin/plugin.html | 71 +- .../tensorrt_llm/quantization/mode.html | 71 +- .../quantization/quantize_by_modelopt.html | 71 +- .../runtime/enc_dec_model_runner.html | 71 +- .../tensorrt_llm/runtime/generation.html | 71 +- .../runtime/kv_cache_manager.html | 71 +- .../tensorrt_llm/runtime/model_runner.html | 71 +- .../runtime/model_runner_cpp.html | 71 +- .../runtime/multimodal_model_runner.html | 93 +- .../tensorrt_llm/runtime/session.html | 71 +- .../tensorrt_llm/sampling_params.html | 71 +- latest/_sources/_cpp_gen/executor.rst.txt | 42 +- latest/_sources/_cpp_gen/runtime.rst.txt | 
360 +- ...tice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt | 90 +- ...MTP_Implementation_and_Optimization.md.txt | 4 +- ..._Expert_Parallelism_in_TensorRT-LLM.md.txt | 4 +- .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- latest/_sources/examples/index.rst.txt | 19 +- .../examples/llm_api_examples.rst.txt | 39 +- .../examples/llm_auto_parallel.rst.txt | 8 - .../examples/llm_eagle2_decoding.rst.txt | 8 - .../examples/llm_eagle_decoding.rst.txt | 8 - .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 4 +- .../llm_inference_async_streaming.rst.txt | 4 +- .../examples/llm_inference_customize.rst.txt | 8 - .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_inference_kv_events.rst.txt | 8 - .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_lookahead_decoding.rst.txt | 8 - .../examples/llm_medusa_decoding.rst.txt | 8 - .../examples/llm_mgmn_llm_distributed.rst.txt | 6 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 6 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 6 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- .../examples/llm_quantization.rst.txt | 8 - .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 6 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 10 + .../examples/trtllm_serve_examples.rst.txt | 6 +- latest/_sources/index.rst.txt | 3 +- .../build-from-source-linux.md.txt | 23 +- .../_sources/installation/containers.md.txt | 10 + .../_sources/installation/grace-hopper.md.txt | 20 - latest/_sources/installation/linux.md.txt | 47 +- latest/_sources/llm-api/index.md.txt | 74 +- 
latest/_sources/llm-api/reference.rst.txt | 6 + .../_sources/performance/perf-overview.md.txt | 36 +- latest/_sources/quick-start-guide.md.txt | 31 +- .../_sources/reference/dev-containers.md.txt | 100 + latest/_sources/torch.md.txt | 45 +- .../feature_combination_matrix.md.txt | 18 + .../torch/features/overlap_scheduler.md.txt | 24 + .../torch/features/quantization.md.txt | 18 + .../_sources/torch/features/sampling.md.txt | 20 + latest/_static/togglebutton.css | 160 + latest/_static/togglebutton.js | 187 + latest/advanced/disaggregated-service.html | 71 +- latest/advanced/executor.html | 71 +- latest/advanced/expert-parallelism.html | 71 +- latest/advanced/gpt-attention.html | 71 +- latest/advanced/gpt-runtime.html | 71 +- latest/advanced/graph-rewriting.html | 71 +- latest/advanced/kv-cache-management.html | 71 +- latest/advanced/kv-cache-reuse.html | 71 +- latest/advanced/lora.html | 71 +- .../advanced/lowprecision-pcie-allreduce.html | 71 +- .../open-sourced-cutlass-kernels.html | 71 +- latest/advanced/speculative-decoding.html | 71 +- latest/advanced/weight-streaming.html | 71 +- latest/architecture/add-model.html | 71 +- latest/architecture/checkpoint.html | 71 +- latest/architecture/core-concepts.html | 71 +- latest/architecture/model-weights-loader.html | 71 +- latest/architecture/overview.html | 71 +- latest/architecture/workflow.html | 71 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 165 +- latest/blogs/Falcon180B-H200.html | 71 +- latest/blogs/H100vsA100.html | 77 +- latest/blogs/H200launch.html | 71 +- latest/blogs/XQA-kernel.html | 71 +- latest/blogs/quantization-in-TRT-LLM.html | 71 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 71 +- ...1_MTP_Implementation_and_Optimization.html | 75 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 71 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 75 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 71 +- latest/commands/trtllm-build.html | 71 +- latest/commands/trtllm-serve.html | 75 +- 
.../build-image-to-dockerhub.html | 71 +- latest/dev-on-cloud/dev-on-runpod.html | 71 +- latest/examples/curl_chat_client.html | 71 +- .../curl_chat_client_for_multimodal.html | 71 +- latest/examples/curl_completion_client.html | 71 +- latest/examples/customization.html | 77 +- .../deepseek_r1_reasoning_parser.html | 71 +- latest/examples/genai_perf_client.html | 71 +- .../genai_perf_client_for_multimodal.html | 71 +- latest/examples/index.html | 98 +- latest/examples/llm_api_examples.html | 145 +- latest/examples/llm_eagle2_decoding.html | 718 - latest/examples/llm_eagle_decoding.html | 723 - latest/examples/llm_guided_decoding.html | 174 +- latest/examples/llm_inference.html | 154 +- latest/examples/llm_inference_async.html | 175 +- .../llm_inference_async_streaming.html | 213 +- latest/examples/llm_inference_customize.html | 719 - .../examples/llm_inference_distributed.html | 173 +- latest/examples/llm_inference_kv_events.html | 711 - latest/examples/llm_logits_processor.html | 315 +- latest/examples/llm_medusa_decoding.html | 756 - latest/examples/llm_mgmn_llm_distributed.html | 176 +- latest/examples/llm_mgmn_trtllm_bench.html | 254 +- latest/examples/llm_mgmn_trtllm_serve.html | 180 +- latest/examples/llm_multilora.html | 194 +- latest/examples/llm_quantization.html | 744 - latest/examples/openai_chat_client.html | 116 +- .../openai_chat_client_for_multimodal.html | 306 +- latest/examples/openai_completion_client.html | 110 +- ...=> openai_completion_client_for_lora.html} | 166 +- latest/examples/trtllm_serve_examples.html | 81 +- latest/genindex.html | 163 +- latest/index.html | 85 +- .../installation/build-from-source-linux.html | 112 +- .../{grace-hopper.html => containers.html} | 149 +- latest/installation/linux.html | 118 +- latest/key-features.html | 71 +- latest/llm-api/index.html | 171 +- latest/llm-api/reference.html | 371 +- latest/objects.inv | Bin 147331 -> 147647 bytes latest/overview.html | 71 +- latest/performance/perf-analysis.html | 71 +- 
latest/performance/perf-benchmarking.html | 71 +- latest/performance/perf-overview.html | 107 +- .../benchmarking-default-performance.html | 71 +- .../deciding-model-sharding-strategy.html | 71 +- .../fp8-quantization.html | 71 +- .../performance-tuning-guide/index.html | 71 +- ...ing-max-batch-size-and-max-num-tokens.html | 71 +- .../useful-build-time-flags.html | 71 +- .../useful-runtime-flags.html | 71 +- latest/py-modindex.html | 71 +- .../python-api/tensorrt_llm.functional.html | 71 +- latest/python-api/tensorrt_llm.layers.html | 77 +- latest/python-api/tensorrt_llm.models.html | 71 +- latest/python-api/tensorrt_llm.plugin.html | 71 +- .../python-api/tensorrt_llm.quantization.html | 71 +- latest/python-api/tensorrt_llm.runtime.html | 71 +- latest/quick-start-guide.html | 110 +- latest/reference/ci-overview.html | 77 +- .../dev-containers.html} | 286 +- latest/reference/memory.html | 71 +- latest/reference/precision.html | 71 +- latest/reference/support-matrix.html | 71 +- latest/reference/troubleshooting.html | 71 +- latest/release-notes.html | 77 +- latest/scripts/disaggregated/README.html | 71 +- latest/search.html | 71 +- latest/searchindex.js | 2 +- latest/torch.html | 117 +- latest/torch/adding_new_model.html | 71 +- latest/torch/arch_overview.html | 71 +- latest/torch/attention.html | 71 +- .../features/feature_combination_matrix.html | 866 + latest/torch/features/overlap_scheduler.html | 666 + latest/torch/features/quantization.html | 632 + latest/torch/features/sampling.html | 634 + latest/torch/kv_cache_manager.html | 71 +- latest/torch/scheduler.html | 71 +- 241 files changed, 22753 insertions(+), 26564 deletions(-) delete mode 100644 latest/_sources/examples/llm_auto_parallel.rst.txt delete mode 100644 latest/_sources/examples/llm_eagle2_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_eagle_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_inference_customize.rst.txt delete mode 100644 
latest/_sources/examples/llm_inference_kv_events.rst.txt delete mode 100644 latest/_sources/examples/llm_lookahead_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_medusa_decoding.rst.txt delete mode 100644 latest/_sources/examples/llm_quantization.rst.txt create mode 100644 latest/_sources/examples/openai_completion_client_for_lora.rst.txt create mode 100644 latest/_sources/installation/containers.md.txt delete mode 100644 latest/_sources/installation/grace-hopper.md.txt create mode 100644 latest/_sources/reference/dev-containers.md.txt create mode 100644 latest/_sources/torch/features/feature_combination_matrix.md.txt create mode 100644 latest/_sources/torch/features/overlap_scheduler.md.txt create mode 100644 latest/_sources/torch/features/quantization.md.txt create mode 100644 latest/_sources/torch/features/sampling.md.txt create mode 100644 latest/_static/togglebutton.css create mode 100644 latest/_static/togglebutton.js delete mode 100644 latest/examples/llm_eagle2_decoding.html delete mode 100644 latest/examples/llm_eagle_decoding.html delete mode 100644 latest/examples/llm_inference_customize.html delete mode 100644 latest/examples/llm_inference_kv_events.html delete mode 100644 latest/examples/llm_medusa_decoding.html delete mode 100644 latest/examples/llm_quantization.html rename latest/examples/{llm_auto_parallel.html => openai_completion_client_for_lora.html} (70%) rename latest/installation/{grace-hopper.html => containers.html} (71%) rename latest/{examples/llm_lookahead_decoding.html => reference/dev-containers.html} (59%) create mode 100644 latest/torch/features/feature_combination_matrix.html create mode 100644 latest/torch/features/overlap_scheduler.html create mode 100644 latest/torch/features/quantization.html create mode 100644 latest/torch/features/sampling.html diff --git a/latest/.buildinfo b/latest/.buildinfo index fff48eff5a..7eaa80657f 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build 
info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 812228e223c943ca4d4a375a1c33a00f +config: cb3cbe8a473ef8fd1cf27e6890eb63f4 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html index aeaf7f62bd..5dbf38d7a5 100644 --- a/latest/_cpp_gen/executor.html +++ b/latest/_cpp_gen/executor.html @@ -35,6 +35,7 @@ + @@ -47,11 +48,17 @@ + + + + + + @@ -63,7 +70,7 @@ - + @@ -318,58 +325,32 @@
Installation
pipLLM API
Examples
Blogs
Public Functions
-Constructs a DisaggExecutorOrchestrator object.
-ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
-requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
-requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
-timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
-Await for generation responses.
-timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
-Indicates if the current process is allowed to enqueueRequests.
-Get context executors.
-Get generation executors.
-Private Members
-Public Functions
-Typedefs
-Enums
- - - - -Functions
-Public Functions
-Private Members
-Public Functions
-Public Functions
-Public Static Functions
-Private Members
-Private Static Functions
-Public Functions
-Public Static Functions
-Public Functions
-Public Functions
-Private Members
-Public Static Functions
-Public Functions
+Constructs a DisaggExecutorOrchestrator object.
+ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
+requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
+requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
+timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
+Await for generation responses.
+timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Indicates if the current process is allowed to enqueueRequests.
+Get context executors.
+Get generation executors.
+Private Members
+Public Functions
+Public Functions
+Private Members
+Friends
+Public Functions
+Public Types
+ + +Public Functions
+Private Members
+Friends
+Public Functions
+Public Functions
+Public Members
+Public Functions
+Public Members
+Public Functions
+Private Members
+Friends
+Public Functions
+ + +Public Members
+Public Functions
+Public Functions
The data corresponding to this event.
The sliding window size.
+Public Functions
+Public Static Functions
Private Members
-Friends
+Public Functions
-Public Types
- - -Public Functions
-Private Members
-Friends
-Public Functions
-Public Functions
-Public Members
-Public Functions
-Public Members
-Public Functions
-Private Members
-Friends
-Public Functions
- - -Public Members
-Public Functions
-Typedefs
+Enums
+ + + + +Functions
+Public Functions
+Private Members
+Public Functions
+Public Functions
+Public Static Functions
+Private Members
+Private Static Functions
+Public Functions
+Public Static Functions
+Public Functions
+Public Functions
+Private Members
+Installation
pipLLM API
Examples
Blogs
Public Types
-Public Functions
+Public Static Functions
+Private Members
+Public Functions
Public Functions
+Is my rank the last rank in its pipeline?
+Public Static Functions
+Public Static Attributes
+Private Members
+Defines
+Typedefs
+Enums
+ + +Public Functions
+Public Members
Functions
+Public Functions
+Public Types
- -Public Functions
Public Static Attributes
+Private Functions
+GPT decoder class with support for in-flight batching.
+Subclassed by tensorrt_llm::runtime::GptDecoderBatched
+Public Types
+Public Functions
+Setup the decoder before calling forward()
Disable Lookahead decoding.
+Run one step for all requests without blocking the host process and return the token for synchronization.
+Run one step for all requests and wait for completion on the host.
+Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
+Public Functions
+Public Members
+[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+Mandatory parameters Logits
+Maximum number of decoding tokens of active slots.
+Public Types
+Public Functions
+Public Members
+ + +Private Functions
+Private Members
+Public Members
+ + + + + + + + + + + + + + +Public Functions
+Public Members
+ + + + + + + + +[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
+[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+Defines
+Public Functions
+Public Members
+Private Types
+Private Functions
+ + +Public Types
+Public Functions
+Public Static Functions
+Private Functions
+Private Members
+Private Static Attributes
+Public Functions
+Public Static Functions
+Private Members
+Public Functions
+Public Members
+Public Functions
+Public Members
+Mandatory parameters Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
+The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
+A Vector of views on newTokensSteps for each token [BS, BM].
+Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
+Mandatory parameters for Beam Search log-probility of generated tokens, [BS, BM, MSL], float
+Public Static Attributes
+Public Functions
+Public Functions
+Public Members
+Public Types
+ + +Public Functions
+Setup buffers for the decoder excluding speculative decoding.
+Setup buffers for the cache indirection.
+This is used for beam search on pipeline parallel ranks without a decoder.
+Setup buffers for speculative decoding.
+Disable lookahead decoding.
+[batchSize], number of finished sequences per request, on gpu
+[batchSize, beamWidth], FinishedState value, on gpu
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
+batchIdx – index of the batch
+[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
+batchIdx – index of the batch
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
+[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
+[maxBeamWidth], cumulative log probabilities (per beam), on gpu
+[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+[batchSize, maxBeamWidth], sequence lengths, on gpu
+batchIdx – index of the batch
+[maxBeamWidth], sequence lengths for request batchIdx, on gpu
Get maxTokensPerStep tokens generated in the last forward pass.
+[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
+[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
+[batchSize], predicted draft tokens lengths for previous step, on gpu
+[batchSize], predicted draft tokens lengths for next step, on gpu
+[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
+[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
+[maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
+Get the number of tokens for all requests in the batch.
+The number of tokens for all requests in the batch.
+Get the number of tokens for a specific request in the batch.
+batchIdx – The index of the request in the batch.
+The number of tokens for the specified request.
+Set the number of tokens for a specific request in the batch.
+batchIdx – The index of the request in the batch.
numTokens – The number of tokens for the specified request.
Get the speculative decoding mode.
+Get the explicit draft tokens buffers.
+Get the eagle buffers.
+Get the lookahead buffers.
+Workspace for beam search in streaming mode.
+Get the generation steps for all requests in the batch.
+The generation steps for all requests in the batch.
+Set the generation steps for all requests in the batch.
+generationSteps – The generation steps for all requests in the batch.
+Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+Private Functions
+Private Members
+Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+[maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token of maxTokensPerStep, on gpu
+Workspace for beam search in streaming mode.
+[batchSize], the num tokens of each request.
+Functions
+Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
+Public Types
+Public Functions
+explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
+Private Members
+Subclassed by tensorrt_llm::runtime::GptDecoder< T >
+Public Types
+ + +Public Functions
+explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
+Public Static Functions
+Public Types
+Public Functions
+Public Members
+Private Functions
+Public Members
+ + + + + + + + + + + + + + + + + + + + + + + + +Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
+Public Functions
+Public Members
+ + + + + + + + +[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
+Represents the inputs to the decoder.
+This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
+Public Types
+Public Functions
+Public Members
+Mandatory parameters The index of the decoding step we are on. Only used in Python runtime
+The maximum number of tokens to decode.
+The maximum length of the attention window to consider while decoding.
+The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.
+The number of samples in the batch.
+The beam widths of each request, [batchSize].
+The maximum value in the stopWordsLens tensor.
The maximum value in the badWordsLens tensor.
The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu
+The end ids, [batchSize * beamWidth] on gpu.
+Address map of the linear batch id to to the seq slots, [batchSize] on pinned, int32_t.
+Optional parameters Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu
+The maximum sequence length for each sequence in the batch, [batchSize] on gpu.
+Parameters for beam search KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu
+Steps of each request, for Variable-Beam-Width-Search, [batchSize].
+Public Members
+Public Members
+Public Members
+ + + + + + + + + + + + + + + + +Public Members
+[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
+[batchSize, maxTokensPerStep], on gpu
+[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
+[batchSize], on gpu
+A helper class for managing memory on host and device.
+Public Types
+ + + + +Public Functions
+Construct a BufferManager.
+cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+Destructor.
+Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Allocates an ITensor of the given dimensions and memory type.
Create an empty IBuffer of the given memory type. It may be resized later.
Create an empty ITensor of the given memory type. It may be reshaped later.
Set the contents of the given buffer to value.
Copy src to dst.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
+The current size of the memory reserved by the memory pool.
+The current size of the memory used by the memory pool.
+The current size of the memory free in the memory pool.
+Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
+Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
Private Members
+Friends
+Public Types
+ + +Public Functions
+Private Members
+ + +Typedefs
- - -Enums
- - -Functions
Gets a typed pointer to the constant underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to constant T.
Gets a typed pointer to the underlying data of the buffer.
-T – The type of the underlying data.
-buffer – The buffer to get a pointer to.
-A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-T – The type of the underlying data.
-bufferPtr – A possibly null shared ptr.
-A pointer to const T, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to const T, possibly nullptr.
-Utility function to print a buffer.
-A wrapper around nvinfer1::DataType that provides support for pointer types.
Public Functions
-Public Types
+Values:
+Public Static Attributes
-Public Functions
+Public Static Functions
+Private Members
Public Types
Public Functions
- -Public Members
+Mandatory parameters.
+Public Functions
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
+flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Pass an existing cuda stream to this object.
+stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Construct with an existing cuda stream or the default stream by passing nullptr.
+Returns the device on which the stream was created.
+Returns the stream associated with this object.
+Synchronizes the stream.
+For converting a TensorRT data type to a C++ data type.
-Public Types
+Private Types
Public Static Attributes
+Private Members
Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Public Types
-Subclassed by tensorrt_llm::runtime::ITensor
+Public Functions
+Private Members
+Public Types
- - - - - - - -Public Functions
Returns a pointer to underlying array.
+Creates a new cuda event. The event will be destroyed in the destructor.
+flags – Flags for event creation. By default, event timing is disabled.
+Returns a pointer to underlying array.
+Pass an existing cuda event to this object.
+event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
Returns a pointer to the underlying array at a given element index.
+Returns the event associated with this object.
Returns a pointer to the underlying array at a given element index.
-Returns the size (in number of elements) of the buffer.
-Returns the size (in bytes) of the buffer.
-Returns the capacity of the buffer.
-Returns the memory type of the buffer.
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Releases the buffer. It will be reset to nullptr.
-Public Static Functions
-Private Types
+ -Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer.
Returns a view on the underlying tensor which can be independently resized.
tensor – The tensor to view.
-A view on the tensor.
Returns a view on the underlying tensor with a different size.
tensor – The tensor to view.
size – The size of the view.
A view on the tensor.
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
An IBuffer.
Determine the memory type of a pointer.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-For converting a C++ data type to a TensorRT data type.
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Public Static Attributes
-Private Static Attributes
+Private Members
Public Functions
+Private Members
+Public Static Attributes
-Public Members
-Mandatory parameters Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL]
-The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.
New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
-A Vector of views on newTokensSteps for each token [BS, BM].
-Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
-Mandatory parameters for Beam Search log-probability of generated tokens, [BS, BM, MSL], float
-Public Static Attributes
-Public Functions
-Public Types
- - -Public Functions
+Functions
Public Types
- - -Public Functions
A helper class for managing memory on host and device.
-Public Types
-Public Functions
Construct a BufferManager.
-cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-Destructor.
-Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
Allocates an IBuffer of the given size and memory type.
Allocates an ITensor of the given dimensions and memory type.
Create an empty IBuffer of the given memory type. It may be resized later.
Create an empty ITensor of the given memory type. It may be reshaped later.
Set the contents of the given buffer to value.
Copy src to dst.
Copy src to dst.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new IBuffer with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Copy src into a new ITensor with a potentially different memory type.
Get the underlying cuda stream.
-The current size of the memory reserved by the memory pool.
-The current size of the memory used by the memory pool.
-The current size of the memory free in the memory pool.
-Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
-Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
Allocates an IBuffer of the given size on the CPU.
Allocates an ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU.
Allocates a pinned ITensor of the given dimensions on the CPU.
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
Allocates an IBuffer of the given size in UVM.
Allocates an ITensor of the given dimensions in UVM.
Allocates an ITensor of the given dimensions for NVLS.
Public Static Attributes
-Private Members
Public Members
+Functions
+Utility function to print a shape.
+Utility function to print a tensor with its shape.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T const, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+tensorPtr – A possibly null shared ptr.
+A pointer to T, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to const T, possibly nullptr.
+Public Types
+ + + + + + + + + + + + +Public Functions
+Returns the tensor n-th dimension. If n is negative, returns the (nbDims + n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
+Sets the tensor dimensions. The new size of the tensor will be volume(dims)
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+Removes the given unit dimensions from this tensor.
+Adds a unit dimension at the specified position.
+Public Static Functions
+Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
Returns the volume of the dimensions. Throws if d.nbDims < 0.
Returns the strides of each dimension in a Shape.
+Removes the given unit dimension from shape.
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
A new shape without the unit dimension.
+Add a unit dimension to shape at the specified position.
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
A new shape with the added unit dimension.
+Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
A view on the buffer.
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
Whenever – offset overflows or the last dimension offset+size overflows.
+A view of shape [size, the rest dimensions] or [size] when
+return the rest slices at the last dimension when size omitted.
offsetDims – specifies all dimensions.
+Just the block at the point, with shape of [the rest dimensions] or [1] when
+Returns a view on the underlying buffer (or tensor) with the given shape.
tensor – The tensor to view.
shape – The shape of the view.
A view on the tensor.
Returns a view on the underlying tensor which can be independently reshaped.
tensor – The tensor to view.
+A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
A flattened view on the tensor.
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
An ITensor.
A convenience function to create a tensor shape with the given dimensions.
+A convenience function for converting a tensor shape to a string.
A convenience function to compare shapes.
+A convenience function to compare shapes.
+Protected Functions
+Friends
GPT decoder class with support for in-flight batching.
+Public Types
+Public Functions
+Setup the decoder before calling forward()
Disable Lookahead decoding.
+Run one step for all requests without blocking the host process and return the token for synchronization.
+Run one step for all requests and wait for completion on the host.
+Gather final beam search results for request batchSlot. Result will only be available after event returned.
Private Types
+Private Functions
+Calls decoders for tokens per engine step.
+Private Members
+Public Functions
Public Static Functions
-Private Members
Public Functions
-Public Members
-Public Types
- - -Public Functions
-Private Members
- - -Functions
-Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-Public Types
-Public Functions
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-Private Members
-Subclassed by tensorrt_llm::runtime::GptDecoder< T >
-Public Types
- - -Public Functions
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-Public Static Functions
-Public Types
-Public Functions
-Public Members
- - -Private Functions
-Private Members
-Public Members
- - - - - - - - - - - - - - -Public Functions
-Public Members
- - - - - - - - -[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-Public Types
- - -Public Functions
-Private Members
-Public Types
-Public Functions
-Public Members
-Private Functions
-Public Members
- - - - - - - - - - - - - - - - - - - - - - - - -Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
-Public Functions
-Public Members
- - - - - - - - -[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
-Functions
-Utility function to print a shape.
-Utility function to print a tensor with its shape.
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-tensorPtr – A possibly null shared ptr.
-A pointer to T const, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-tensorPtr – A possibly null shared ptr.
-A pointer to T, possibly nullptr.
-Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to T, possibly nullptr.
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-T – The type of the underlying data.
-optionalBufferPtr – A possibly empty optional.
-A pointer to const T, possibly nullptr.
-Public Types
- - - - - - - - - - - - -Public Functions
-Returns the tensor n-th dimension. If n is negative, returns the (nbDims + n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
-Sets the tensor dimensions. The new size of the tensor will be volume(dims)
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-Removes the given unit dimensions from this tensor.
-Adds a unit dimension at the specified position.
-Public Static Functions
-Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
Returns the volume of the dimensions. Throws if d.nbDims < 0.
Returns the strides of each dimension in a Shape.
-Removes the given unit dimension from shape.
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
A new shape without the unit dimension.
-Add a unit dimension to shape at the specified position.
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
A new shape with the added unit dimension.
-Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
A view on the buffer.
offsetDims – The offset in multiple dimensions.
tensor – The tensor to view.
offsetDims – The offset dimensions of the view.
size – The size of the view w.r.t. the last dimension in offsetDims.
offsetDims – specifies all dimensions.
Whenever – offset overflows or the last dimension offset+size overflows.
-A view of shape [size, the rest dimensions] or [size] when
-return the rest slices at the last dimension when size omitted.
offsetDims – specifies all dimensions.
-Just the block at the point, with shape of [the rest dimensions] or [1] when
-Returns a view on the underlying buffer (or tensor) with the given shape.
tensor – The tensor to view.
shape – The shape of the view.
A view on the tensor.
Returns a view on the underlying tensor which can be independently reshaped.
tensor – The tensor to view.
-A view on the tensor.
Returns a flattened view on the underlying tensor which can be independently reshaped.
tensor – The tensor to flatten.
sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.
A flattened view on the tensor.
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
An ITensor.
A convenience function to create a tensor shape with the given dimensions.
-A convenience function for converting a tensor shape to a string.
A convenience function to compare shapes.
-A convenience function to compare shapes.
-Protected Functions
-Friends
-Defines
-Typedefs
-Enums
- - -Public Functions
-Public Functions
-Is my rank the last rank in its pipeline?
-Public Static Functions
-Public Static Attributes
-Private Members
-Functions
-Public Types
-Values:
-Public Functions
-Public Static Functions
-Private Members
-Public Types
-Public Functions
-Public Static Functions
-Private Functions
-Private Members
-Private Static Attributes
-Public Types
-Public Functions
-Creates a new cuda event. The event will be destroyed in the destructor.
-flags – Flags for event creation. By default, event timing is disabled.
-Pass an existing cuda event to this object.
-event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
Synchronizes the event.
-Private Types
- - -Represents the inputs to the decoder.
-This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.
-Public Types
-Public Functions
-Public Members
-Mandatory parameters The index of the decoding step we are on. Only used in Python runtime
-The maximum number of tokens to decode.
-The maximum length of the attention window to consider while decoding.
-The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.
-The number of samples in the batch.
-The beam widths of each request, [batchSize].
-The maximum value in the stopWordsLens tensor.
The maximum value in the badWordsLens tensor.
The output of the model forward computation, a probability distribution over the vocabulary [batchSize, beamWidth, vocabSizePadded] on gpu
-Another view on the logits, [batchSize][beamWidth, vocabSizePadded] on gpu.
-The end ids, [batchSize * beamWidth] on gpu.
-Address map of the linear batch id to the seq slots, [batchSize] on pinned, int32_t.
-Optional parameters Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu
-The maximum sequence length for each sequence in the batch, [batchSize] on gpu.
-Parameters for beam search KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu
-Steps of each request, for Variable-Beam-Width-Search, [batchSize].
-Public Functions
-Public Members
-Public Members
-Public Members
- - - - - - - - - - - - - - - - -Public Members
-[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu
-[batchSize, maxTokensPerStep], on gpu
-[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu
-[batchSize], on gpu
-GPT decoder class with support for in-flight batching.
-Subclassed by tensorrt_llm::runtime::GptDecoderBatched
-Public Types
-Public Functions
-Setup the decoder before calling forward()
Disable Lookahead decoding.
-Run one step for all requests without blocking the host process and return the token for synchronization.
-Run one step for all requests and wait for completion on the host.
-Gather final beam search results for request batchIdx. Result will only be available after event returned.
Public Types
Public Functions
Public Members
[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-Mandatory parameters Logits
-Maximum number of decoding tokens of active slots.
-Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize].
-For Beam Search The generation step of each request (for Variable-Beam-Width-Search), [batchSize]
-For speculative decoding Logits of draft [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
-Explicit draft tokens data.
-Eagle data.
-Public Functions
-Private Members
-GPT decoder class with support for in-flight batching.
-Public Types
Public Functions
Setup the decoder before calling forward()
Disable Lookahead decoding.
-Run one step for all requests without blocking the host process and return the token for synchronization.
-Run one step for all requests and wait for completion on the host.
-Gather final beam search results for request batchSlot. Result will only be available after event returned.
Private Types
-Private Functions
-Calls decoders for tokens per engine step.
-Private Members
+Public Members
Public Functions
-Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
-flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
Pass an existing cuda stream to this object.
-stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
Construct with an existing cuda stream or the default stream by passing nullptr.
-Returns the device on which the stream was created.
-Returns the stream associated with this object.
-Synchronizes the stream.
-Private Types
+Public Types
Public Types
+ + +Public Functions
+Public Types
+ + +Public Functions
+Private Members
+Typedefs
+ + +Enums
+ + +Functions
Gets a typed pointer to the constant underlying data of the buffer.
+T – The type of the underlying data.
+buffer – The buffer to get a pointer to.
+A pointer to constant T.
Gets a typed pointer to the underlying data of the buffer.
+T – The type of the underlying data.
+buffer – The buffer to get a pointer to.
+A pointer to T.
Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+T – The type of the underlying data.
+bufferPtr – A possibly null shared ptr.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+T – The type of the underlying data.
+bufferPtr – A possibly null shared ptr.
+A pointer to const T, possibly nullptr.
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to T, possibly nullptr.
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+T – The type of the underlying data.
+optionalBufferPtr – A possibly empty optional.
+A pointer to const T, possibly nullptr.
+Public Members
-Defines
-Public Functions
-Public Members
-Private Types
-Private Functions
- - -A wrapper around nvinfer1::DataType that provides support for pointer types.
Public Functions
Public Members
-Mandatory parameters.
-Public Functions
-Public Members
-Public Types
- - -Public Functions
-Allocate buffers for speculative decoding.
-Setup buffers for the decoder excluding speculative decoding.
-Setup buffers for the cache indirection.
-This is used for beam search on pipeline parallel ranks without a decoder.
-Setup buffers for speculative decoding.
-Disable lookahead decoding.
-[batchSize], number of finished sequences per request, on gpu
-[batchSize, beamWidth], FinishedState value, on gpu
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-batchIdx – index of the batch
-[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu. In case of beam search, contains the ungathered data.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
-batchIdx – index of the batch
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx, on gpu.
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-[batchSize, maxBeamWidth], sequence lengths, on gpu
-batchIdx – index of the batch
-[maxBeamWidth], sequence lengths for request batchIdx, on gpu
Get maxTokensPerStep tokens generated in the last forward pass.
-[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-[batchSize], predicted draft tokens lengths for previous step, on gpu
-[batchSize], predicted draft tokens lengths for next step, on gpu
-[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-[maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
-Get the number of tokens for all requests in the batch.
-The number of tokens for all requests in the batch.
-Get the number of tokens for a specific request in the batch.
-batchIdx – The index of the request in the batch.
-The number of tokens for the specified request.
-Set the number of tokens for a specific request in the batch.
-batchIdx – The index of the request in the batch.
numTokens – The number of tokens for the specified request.
Get the speculative decoding mode.
-Get the explicit draft tokens buffers.
-Get the eagle buffers.
-Get the lookahead buffers.
-Workspace for beam search in streaming mode.
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-Private Members
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-[maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token of maxTokensPerStep, on gpu
-Workspace for beam search in streaming mode.
-[batchSize], the num tokens of each request.
-Functions
-Public Functions
-Public Functions
-Public Static Attributes
Private Functions
-Public Types
For converting a TensorRT data type to a C++ data type.
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Public Types
+Subclassed by tensorrt_llm::runtime::ITensor
+Public Types
+ + + + + + + + + + +Public Functions
+Returns a pointer to underlying array.
+Returns a pointer to underlying array.
+Returns a pointer to the underlying array at a given element index.
+Returns a pointer to the underlying array at a given element index.
+Returns the size (in number of elements) of the buffer.
+Returns the size (in bytes) of the buffer.
+Returns the capacity of the buffer.
+Returns the memory type of the buffer.
+Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+Releases the buffer. It will be reset to nullptr.
+Not allowed to copy.
+Not allowed to copy.
+Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
A view on the buffer.
Returns a view on the underlying tensor which can be independently resized.
tensor – The tensor to view.
+A view on the tensor.
Returns a view on the underlying tensor with a different size.
tensor – The tensor to view.
size – The size of the view.
A view on the tensor.
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
An IBuffer.
Determine the memory type of a pointer.
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+For converting a C++ data type to a TensorRT data type.
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Public Static Attributes
+Private Members
+Private Static Attributes
Public Static Attributes