From 7b17110c12be99664b4271f896b4cef955a3db76 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Wed, 16 Jul 2025 02:09:52 +0000
Subject: [PATCH] Update latest GitHub pages to v1.0.0rc3
---
latest/.buildinfo | 2 +-
latest/_cpp_gen/executor.html | 6497 +++---
latest/_cpp_gen/runtime.html | 18227 ++++++++--------
.../attention.py | 23 +-
.../b6815cf245cc7dc7a26a6f727fdc2dc4/model.py | 16 +-
.../llm_args.py | 224 +-
.../_images/disaggregated-service_usage.png | Bin 39888 -> 0 bytes
latest/_modules/index.html | 11 +-
latest/_modules/tensorrt_llm/builder.html | 11 +-
.../tensorrt_llm/disaggregated_params.html | 11 +-
.../tensorrt_llm/executor/result.html | 32 +-
.../_modules/tensorrt_llm/executor/utils.html | 11 +-
latest/_modules/tensorrt_llm/functional.html | 11 +-
.../tensorrt_llm/layers/activation.html | 11 +-
.../tensorrt_llm/layers/attention.html | 11 +-
latest/_modules/tensorrt_llm/layers/cast.html | 11 +-
latest/_modules/tensorrt_llm/layers/conv.html | 11 +-
.../tensorrt_llm/layers/embedding.html | 11 +-
.../_modules/tensorrt_llm/layers/linear.html | 11 +-
latest/_modules/tensorrt_llm/layers/mlp.html | 11 +-
.../tensorrt_llm/layers/normalization.html | 11 +-
.../_modules/tensorrt_llm/layers/pooling.html | 11 +-
.../tensorrt_llm/llmapi/build_cache.html | 11 +-
latest/_modules/tensorrt_llm/llmapi/llm.html | 64 +-
.../tensorrt_llm/llmapi/llm_args.html | 273 +-
.../tensorrt_llm/llmapi/mpi_session.html | 11 +-
.../tensorrt_llm/models/baichuan/model.html | 11 +-
.../tensorrt_llm/models/bert/model.html | 11 +-
.../tensorrt_llm/models/bloom/model.html | 11 +-
.../tensorrt_llm/models/chatglm/config.html | 11 +-
.../tensorrt_llm/models/chatglm/model.html | 11 +-
.../tensorrt_llm/models/clip/model.html | 11 +-
.../tensorrt_llm/models/cogvlm/config.html | 11 +-
.../tensorrt_llm/models/cogvlm/model.html | 11 +-
.../tensorrt_llm/models/commandr/model.html | 11 +-
.../tensorrt_llm/models/dbrx/config.html | 11 +-
.../tensorrt_llm/models/dbrx/model.html | 11 +-
.../models/deepseek_v1/model.html | 11 +-
.../models/deepseek_v2/model.html | 11 +-
.../tensorrt_llm/models/dit/model.html | 11 +-
.../tensorrt_llm/models/eagle/model.html | 17 +-
.../tensorrt_llm/models/enc_dec/model.html | 11 +-
.../tensorrt_llm/models/falcon/config.html | 11 +-
.../tensorrt_llm/models/falcon/model.html | 11 +-
.../tensorrt_llm/models/gemma/config.html | 11 +-
.../tensorrt_llm/models/gemma/model.html | 19 +-
.../tensorrt_llm/models/gpt/config.html | 11 +-
.../tensorrt_llm/models/gpt/model.html | 11 +-
.../tensorrt_llm/models/gptj/config.html | 11 +-
.../tensorrt_llm/models/gptj/model.html | 11 +-
.../tensorrt_llm/models/gptneox/model.html | 11 +-
.../tensorrt_llm/models/llama/config.html | 11 +-
.../tensorrt_llm/models/llama/model.html | 11 +-
.../tensorrt_llm/models/mamba/model.html | 11 +-
.../tensorrt_llm/models/medusa/config.html | 13 +-
.../tensorrt_llm/models/medusa/model.html | 13 +-
.../tensorrt_llm/models/mllama/model.html | 11 +-
.../tensorrt_llm/models/mmdit_sd3/model.html | 11 +-
.../tensorrt_llm/models/modeling_utils.html | 14 +-
.../tensorrt_llm/models/mpt/model.html | 11 +-
.../models/multimodal_encoders/config.html | 11 +-
.../models/multimodal_encoders/model.html | 11 +-
.../tensorrt_llm/models/opt/model.html | 11 +-
.../tensorrt_llm/models/phi/model.html | 11 +-
.../tensorrt_llm/models/phi3/model.html | 11 +-
.../models/recurrentgemma/model.html | 11 +-
.../tensorrt_llm/models/redrafter/model.html | 17 +-
.../_modules/tensorrt_llm/plugin/plugin.html | 11 +-
.../tensorrt_llm/quantization/mode.html | 11 +-
.../quantization/quantize_by_modelopt.html | 11 +-
.../runtime/enc_dec_model_runner.html | 11 +-
.../tensorrt_llm/runtime/generation.html | 11 +-
.../runtime/kv_cache_manager.html | 11 +-
.../tensorrt_llm/runtime/model_runner.html | 11 +-
.../runtime/model_runner_cpp.html | 11 +-
.../runtime/multimodal_model_runner.html | 11 +-
.../tensorrt_llm/runtime/session.html | 11 +-
.../tensorrt_llm/sampling_params.html | 14 +-
latest/_sources/_cpp_gen/executor.rst.txt | 66 +-
latest/_sources/_cpp_gen/runtime.rst.txt | 346 +-
.../advanced/disaggregated-service.md.txt | 81 +-
...tice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt | 54 +-
...MTP_Implementation_and_Optimization.md.txt | 8 +-
...Throughput_on_NVIDIA_Blackwell_GPUs.md.txt | 8 +-
...saggregated_Serving_in_TensorRT-LLM.md.txt | 15 +
.../blog6_Llama4_maverick_eagle_guide.md.txt | 148 +
.../examples/llm_api_examples.rst.txt | 1 +
.../_sources/examples/llm_inference.rst.txt | 2 +-
.../examples/llm_inference_async.rst.txt | 2 +-
.../llm_inference_async_streaming.rst.txt | 2 +-
.../llm_inference_distributed.rst.txt | 2 +-
.../examples/llm_speculative_decoding.rst.txt | 8 +
latest/_sources/llm-api/index.md.txt | 23 +-
latest/_sources/llm-api/reference.rst.txt | 6 +
latest/_sources/torch/adding_new_model.md.txt | 4 +-
latest/advanced/disaggregated-service.html | 83 +-
latest/advanced/executor.html | 11 +-
latest/advanced/expert-parallelism.html | 11 +-
latest/advanced/gpt-attention.html | 11 +-
latest/advanced/gpt-runtime.html | 11 +-
latest/advanced/graph-rewriting.html | 11 +-
latest/advanced/kv-cache-management.html | 11 +-
latest/advanced/kv-cache-reuse.html | 11 +-
latest/advanced/lora.html | 11 +-
.../advanced/lowprecision-pcie-allreduce.html | 11 +-
.../open-sourced-cutlass-kernels.html | 11 +-
latest/advanced/speculative-decoding.html | 15 +-
latest/advanced/weight-streaming.html | 11 +-
latest/architecture/add-model.html | 11 +-
latest/architecture/checkpoint.html | 11 +-
latest/architecture/core-concepts.html | 11 +-
latest/architecture/model-weights-loader.html | 11 +-
latest/architecture/overview.html | 11 +-
latest/architecture/workflow.html | 11 +-
...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 65 +-
latest/blogs/Falcon180B-H200.html | 11 +-
latest/blogs/H100vsA100.html | 11 +-
latest/blogs/H200launch.html | 11 +-
latest/blogs/XQA-kernel.html | 11 +-
latest/blogs/quantization-in-TRT-LLM.html | 11 +-
...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 11 +-
...1_MTP_Implementation_and_Optimization.html | 19 +-
...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 16 +-
...ng_Expert_Parallelism_in_TensorRT-LLM.html | 11 +-
...Disaggregated_Serving_in_TensorRT-LLM.html | 33 +-
.../blog6_Llama4_maverick_eagle_guide.html | 797 +
latest/commands/trtllm-build.html | 11 +-
latest/commands/trtllm-serve.html | 11 +-
.../build-image-to-dockerhub.html | 11 +-
latest/dev-on-cloud/dev-on-runpod.html | 11 +-
latest/examples/curl_chat_client.html | 11 +-
.../curl_chat_client_for_multimodal.html | 11 +-
latest/examples/curl_completion_client.html | 11 +-
latest/examples/customization.html | 11 +-
.../deepseek_r1_reasoning_parser.html | 11 +-
latest/examples/genai_perf_client.html | 11 +-
.../genai_perf_client_for_multimodal.html | 11 +-
latest/examples/index.html | 23 +-
latest/examples/llm_api_examples.html | 12 +-
latest/examples/llm_guided_decoding.html | 11 +-
latest/examples/llm_inference.html | 53 +-
latest/examples/llm_inference_async.html | 69 +-
.../llm_inference_async_streaming.html | 106 +-
.../examples/llm_inference_distributed.html | 55 +-
latest/examples/llm_logits_processor.html | 11 +-
latest/examples/llm_mgmn_llm_distributed.html | 19 +-
latest/examples/llm_mgmn_trtllm_bench.html | 11 +-
latest/examples/llm_mgmn_trtllm_serve.html | 11 +-
latest/examples/llm_multilora.html | 17 +-
latest/examples/llm_speculative_decoding.html | 735 +
latest/examples/openai_chat_client.html | 11 +-
.../openai_chat_client_for_multimodal.html | 11 +-
latest/examples/openai_completion_client.html | 11 +-
.../openai_completion_client_for_lora.html | 11 +-
latest/examples/trtllm_serve_examples.html | 11 +-
latest/genindex.html | 89 +-
latest/index.html | 16 +-
.../installation/build-from-source-linux.html | 11 +-
latest/installation/containers.html | 13 +-
latest/installation/linux.html | 23 +-
latest/key-features.html | 11 +-
latest/llm-api/index.html | 35 +-
latest/llm-api/reference.html | 327 +-
latest/objects.inv | Bin 147647 -> 148460 bytes
latest/overview.html | 11 +-
latest/performance/perf-analysis.html | 11 +-
latest/performance/perf-benchmarking.html | 11 +-
latest/performance/perf-overview.html | 15 +-
.../benchmarking-default-performance.html | 11 +-
.../deciding-model-sharding-strategy.html | 11 +-
.../fp8-quantization.html | 11 +-
.../performance-tuning-guide/index.html | 11 +-
...ing-max-batch-size-and-max-num-tokens.html | 11 +-
.../useful-build-time-flags.html | 11 +-
.../useful-runtime-flags.html | 11 +-
latest/py-modindex.html | 11 +-
.../python-api/tensorrt_llm.functional.html | 11 +-
latest/python-api/tensorrt_llm.layers.html | 11 +-
latest/python-api/tensorrt_llm.models.html | 21 +-
latest/python-api/tensorrt_llm.plugin.html | 11 +-
.../python-api/tensorrt_llm.quantization.html | 11 +-
latest/python-api/tensorrt_llm.runtime.html | 11 +-
latest/quick-start-guide.html | 25 +-
latest/reference/ci-overview.html | 11 +-
latest/reference/dev-containers.html | 11 +-
latest/reference/memory.html | 11 +-
latest/reference/precision.html | 11 +-
latest/reference/support-matrix.html | 11 +-
latest/reference/troubleshooting.html | 11 +-
latest/release-notes.html | 11 +-
latest/scripts/disaggregated/README.html | 11 +-
latest/search.html | 11 +-
latest/searchindex.js | 2 +-
latest/torch.html | 36 +-
latest/torch/adding_new_model.html | 15 +-
latest/torch/arch_overview.html | 11 +-
latest/torch/attention.html | 11 +-
.../features/feature_combination_matrix.html | 11 +-
latest/torch/features/overlap_scheduler.html | 11 +-
latest/torch/features/quantization.html | 11 +-
latest/torch/features/sampling.html | 11 +-
latest/torch/kv_cache_manager.html | 11 +-
latest/torch/scheduler.html | 11 +-
203 files changed, 16316 insertions(+), 14199 deletions(-)
delete mode 100644 latest/_images/disaggregated-service_usage.png
create mode 100644 latest/_sources/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md.txt
create mode 100644 latest/_sources/examples/llm_speculative_decoding.rst.txt
create mode 100644 latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html
create mode 100644 latest/examples/llm_speculative_decoding.html
diff --git a/latest/.buildinfo b/latest/.buildinfo
index 7eaa80657f..4d83aad3b8 100644
--- a/latest/.buildinfo
+++ b/latest/.buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: cb3cbe8a473ef8fd1cf27e6890eb63f4
+config: ee79abf721be5d1b28815a3912832a13
tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html
index 5dbf38d7a5..07cfa3ce9e 100644
--- a/latest/_cpp_gen/executor.html
+++ b/latest/_cpp_gen/executor.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -348,6 +348,7 @@
Generate text with guided decoding
Control generated text using logits processor
Generate text with multiple LoRA adapters
+Speculative Decoding
Run LLM-API with pytorch backend on Slurm
Run trtllm-bench with pytorch backend on Slurm
Run trtllm-serve with pytorch backend on Slurm
@@ -408,7 +409,7 @@
KV Cache Management: Pools, Blocks, and Events
KV cache reuse
Speculative Sampling
-Disaggregated-Service (experimental)
+Disaggregated-Service (Experimental)
Performance
@@ -496,14 +497,279 @@
Executor
-
-tensor.h
+
+disaggServerUtil.h
namespace tensorrt_llm
namespace executor
+
+
+namespace disagg_executor
+
+
+class DisaggExecutorOrchestrator
+
+
Public Functions
+
+
+DisaggExecutorOrchestrator (
+
+
+std :: vector < std :: filesystem :: path > const & ctxEnginePaths ,
+std :: vector < std :: filesystem :: path > const & genEnginePaths ,
+std :: vector < executor :: ExecutorConfig > const & ctxExecutorConfigs ,
+std :: vector < executor :: ExecutorConfig > const & genExecutorConfigs ,
+bool hasContextAwaitThreads ,
+bool hasGenAwaitThreads ,
+
+
+)
+Constructs a DisaggExecutorOrchestrator object.
+
+Parameters:
+
+ctxEnginePaths – A vector of file paths to context engine files.
+genEnginePaths – A vector of file paths to generation engine files.
+ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
+genExecutorConfigs – A vector of ExecutorConfig for generation executors.
+hasContextAwaitThreads – Whether or not there are threads that receive responses for each context executor.
+hasGenAwaitThreads – Whether or not there are threads that receive responses for each generation executor.
+
+
+
+
+
+
+
+std :: vector < IdType > enqueueContext (
+
+
+std :: vector < texec :: Request > const & requests ,
+std :: optional < int > selectContextId = std :: nullopt ,
+bool batch = false ,
+
+
+)
+Enqueue context-only requests to context executors.
+
+Parameters:
+
+requests – A vector of context-only requests.
+selectContextId – The index of the context executor to use. If std::nullopt , the executor that has the smallest number of inflight requests will be used.
+batch – If true, enqueue all requests in the same context executor. If false, try to use a different executor for each request.
+
+
+Returns:
+A vector of global request ids, in the same order as the requests in requests. The returned ids may differ from the request ids used inside each executor.
+
+
+
+
+
+
+void enqueueGeneration (
+
+
+std :: vector < texec :: Request > const & requests ,
+std :: vector < IdType > const & globalRequestIds ,
+std :: optional < int > selectGenIdx = std :: nullopt ,
+bool batch = false ,
+
+
+)
+Enqueue generation-only requests to generation executors.
+
+Parameters:
+
+requests – A vector of generation-only requests.
+globalRequestIds – A vector of global request ids, in the same order as the requests; these must be the ids returned by the enqueueContext function.
+selectGenIdx – The index of the generation executor to use. If std::nullopt , the executor that has the smallest number of inflight requests will be used.
+batch – If true, enqueue all requests in the same generation executor. If false, try to use a different executor for each request.
+
+
+
+
+
+
+
+std :: vector < ResponseWithId > awaitContextResponses (
+
+
+std :: optional < std :: chrono :: milliseconds > const & timeout ,
+std :: optional < int > contextIdx = std :: nullopt ,
+
+
+)
+Wait for context responses.
+
+Parameters:
+
+timeout – The maximum time to wait for new responses.
+contextIdx – The index of the context executor to use. If std::nullopt, return ready responses from all context executors. If hasContextAwaitThreads is true, this parameter must be std::nullopt.
+
+
+Returns:
+A vector of responses with corresponding global request ids.
+
+
+
+
+
+
+std :: vector < ResponseWithId > awaitGenerationResponses (
+
+
+std :: optional < std :: chrono :: milliseconds > const & timeout ,
+std :: optional < int > genIdx = std :: nullopt ,
+
+
+)
+Wait for generation responses.
+
+Parameters:
+
+timeout – The maximum time to wait for new responses.
+genIdx – The index of the generation executor to use. If std::nullopt, return ready responses from all generation executors. If hasGenAwaitThreads is true, this parameter must be std::nullopt.
+
+
+Returns:
+A vector of responses with corresponding global request ids.
+
+
+
+
+
+
+bool canEnqueue ( ) const
+Indicates whether the current process is allowed to enqueue requests.
+
+
+
+
+std :: vector < std :: unique_ptr < texec :: Executor > > const & getContextExecutors (
+
+
+
+
+) const
+Get context executors.
+
+
+
+
+std :: vector < std :: unique_ptr < texec :: Executor > > const & getGenExecutors (
+
+
+
+
+) const
+Get generation executors.
+
+
+
+
+~DisaggExecutorOrchestrator ( )
+
+
+
+
+
Private Members
+
+
+std :: unique_ptr < Impl > mImpl
+
+
+
+
+
+
+
+struct ResponseWithId
+
+
+
+
+
+
+
+
+
+
+
+
+tensor.h
+
+
+namespace tensorrt_llm
+
+
+namespace executor
class Shape : public tensorrt_llm :: common :: ArrayView < detail :: DimType64 const >
@@ -973,6 +1239,2009 @@
+
+
+transferAgent.h
+
+
+namespace tensorrt_llm
+
+
+namespace executor
+
+
+namespace kv_cache
+
+
Typedefs
+
+
+using TransferDescs = MemoryDescs
+
+
+
+
+using RegisterDescs = MemoryDescs
+
+
+
+
+using SyncMessage = std :: string
+
+
+
+
+using ConnectionInfoType = std :: string
+
+
+
+
+
Enums
+
+
+enum class MemoryType : uint8_t
+Values:
+
+
+enumerator kDRAM
+
+
+
+
+enumerator kVRAM
+
+
+
+
+enumerator kBLK
+
+
+
+
+enumerator kOBJ
+
+
+
+
+enumerator kFILE
+
+
+
+
+
+
+enum class TransferOp : uint8_t
+Values:
+
+
+enumerator kREAD
+
+
+
+
+enumerator kWRITE
+
+
+
+
+
+
+
Functions
+
+
+template < typename ... Args > std :: unique_ptr < BaseTransferAgent > makeTransferAgent (
+
+
+std :: string const & backend ,
+Args & & ... args ,
+
+
+)
+
+
+
+
+
+class AgentDesc
+
+
Public Functions
+
+
+inline AgentDesc ( std :: string backendAgentDesc )
+
+
+
+
+inline std :: string const & getBackendAgentDesc ( ) const noexcept
+
+
+
+
+
Private Members
+
+
+std :: string mBackendAgentDesc
+
+
+
+
+
+
+
+struct BaseAgentConfig
+
+
Public Members
+
+
+std :: string mName
+
+
+
+
+bool useProgThread
+
+
+
+
+
+
+
+class BaseTransferAgent
+
+
Public Functions
+
+
+virtual ~BaseTransferAgent ( ) = default
+
+
+
+
+virtual void registerMemory ( RegisterDescs const & descs ) = 0
+
+
+
+
+virtual void deregisterMemory ( RegisterDescs const & descs ) = 0
+
+
+
+
+virtual void loadRemoteAgent (
+
+
+std :: string const & name ,
+AgentDesc const & agentDesc ,
+
+
+) = 0
+
+
+
+
+virtual AgentDesc getLocalAgentDesc ( ) = 0
+
+
+
+
+virtual void invalidateRemoteAgent ( std :: string const & name ) = 0
+
+
+
+
+virtual std :: unique_ptr < TransferStatus > submitTransferRequests (
+
+
+TransferRequest const & request ,
+
+
+) = 0
+
+
+
+
+virtual void notifySyncMessage (
+
+
+std :: string const & name ,
+SyncMessage const & syncMessage ,
+
+
+) = 0
+
+
+
+
+virtual std :: unordered_map < std :: string , std :: vector < SyncMessage > > getNotifiedSyncMessages (
+
+
+
+
+) = 0
+
+
+
+
+virtual ConnectionInfoType getConnectionInfo ( ) = 0
+
+
+
+
+virtual void connectRemoteAgent (
+
+
+std :: string const & name ,
+ConnectionInfoType const & connectionInfo ,
+
+
+) = 0
+
+
+
+
+virtual bool checkRemoteDescs (
+
+
+std :: string const & name ,
+MemoryDescs const & memoryDescs ,
+
+
+) = 0
+
+
+
+
+
+
+
+class DynLibLoader
+
+
Public Functions
+
+
+void * getHandle ( std :: string const & name )
+
+
+
+
+template < typename FunctionT > inline FunctionT getFunctionPointer (
+
+
+std :: string const & libName ,
+std :: string const & funcName ,
+
+
+)
+
+
+
+
+~DynLibLoader ( )
+
+
+
+
+DynLibLoader ( ) = default
+
+
+
+
+DynLibLoader ( DynLibLoader const & ) = delete
+
+
+
+
+DynLibLoader & operator = ( DynLibLoader const & ) = delete
+
+
+
+
+
Public Static Functions
+
+
+static DynLibLoader & getInstance ( )
+
+
+
+
+
Private Members
+
+
+std :: mutex mDllMutex
+
+
+
+
+std :: unordered_map < std :: string , void * > mHandlers
+
+
+
+
+
Private Static Functions
+
+
+static void * dlSym ( void * handle , char const * symbol )
+
+
+
+
+
+
+
+class MemoryDesc
+
+
Public Functions
+
+
+inline MemoryDesc (
+
+
+std :: vector < char > const & vec ,
+uint32_t deviceId = 0 ,
+
+
+)
+
+
+
+
+inline MemoryDesc ( void * addr , size_t len , uint32_t deviceId )
+
+
+
+
+inline MemoryDesc ( uintptr_t addr , size_t len , uint32_t deviceId )
+
+
+
+
+inline uintptr_t getAddr ( ) const noexcept
+
+
+
+
+inline size_t getLen ( ) const noexcept
+
+
+
+
+inline uint32_t getDeviceId ( ) const noexcept
+
+
+
+
+
Public Static Functions
+
+
+static void serialize ( MemoryDesc const & memoryDesc , std :: ostream & os )
+
+
+
+
+static MemoryDesc deserialize ( std :: istream & is )
+
+
+
+
+static size_t serializedSize ( MemoryDesc const & memoryDesc )
+
+
+
+
+
Private Members
+
+
+uintptr_t mAddr
+
+
+
+
+size_t mLen
+
+
+
+
+uint32_t mDeviceId
+
+
+
+
+
+
+
+class MemoryDescs
+
+
Public Functions
+
+
+inline MemoryDescs ( MemoryType type , std :: vector < MemoryDesc > descs )
+
+
+
+
+inline MemoryType getType ( ) const noexcept
+
+
+
+
+inline std :: vector < MemoryDesc > const & getDescs ( ) const noexcept
+
+
+
+
+
+
+
+
+class TransferRequest
+
+
Public Functions
+
+
+inline TransferRequest (
+
+
+TransferOp op ,
+TransferDescs srcDescs ,
+TransferDescs dstDescs ,
+std :: string const & remoteName ,
+std :: optional < SyncMessage > syncMessage = std :: nullopt ,
+
+
+)
+
+
+
+
+inline TransferOp getOp ( ) const noexcept
+
+
+
+
+inline TransferDescs const & getSrcDescs ( ) const noexcept
+
+
+
+
+inline TransferDescs const & getDstDescs ( ) const noexcept
+
+
+
+
+inline std :: string const & getRemoteName ( ) const noexcept
+
+
+
+
+inline std :: optional < SyncMessage > getSyncMessage ( ) const noexcept
+
+
+
+
+
+
+
+
+class TransferStatus
+
+
Public Functions
+
+
+virtual ~TransferStatus ( ) = default
+
+
+
+
+virtual bool isCompleted ( ) const = 0
+
+
+
+
+virtual void wait ( ) const = 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+serialization.h
+
+
+namespace tensorrt_llm
+
+
+namespace executor
+
+
+class Serialization
+
+
Public Static Functions
+
+
+static RequestPerfMetrics :: TimePoint deserializeTimePoint (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+RequestPerfMetrics :: TimePoint const & tp ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( RequestPerfMetrics :: TimePoint const & )
+
+
+
+
+static RequestPerfMetrics deserializeRequestPerfMetrics (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+RequestPerfMetrics const & metrics ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( RequestPerfMetrics const & metrics )
+
+
+
+
+static SamplingConfig deserializeSamplingConfig ( std :: istream & is )
+
+
+
+
+static void serialize ( SamplingConfig const & config , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( SamplingConfig const & config )
+
+
+
+
+static OutputConfig deserializeOutputConfig ( std :: istream & is )
+
+
+
+
+static void serialize ( OutputConfig const & config , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( OutputConfig const & config )
+
+
+
+
+static AdditionalModelOutput deserializeAdditionalModelOutput (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+AdditionalModelOutput const & additionalModelOutput ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+AdditionalModelOutput const & additionalModelOutput ,
+
+
+)
+
+
+
+
+static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+ExternalDraftTokensConfig const & config ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( ExternalDraftTokensConfig const & config )
+
+
+
+
+static PromptTuningConfig deserializePromptTuningConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+PromptTuningConfig const & config ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( PromptTuningConfig const & config )
+
+
+
+
+static MultimodalInput deserializeMultimodalInput ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+MultimodalInput const & multimodalInput ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( MultimodalInput const & multimodalInput )
+
+
+
+
+static MropeConfig deserializeMropeConfig ( std :: istream & is )
+
+
+
+
+static void serialize ( MropeConfig const & config , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( MropeConfig const & config )
+
+
+
+
+static LoraConfig deserializeLoraConfig ( std :: istream & is )
+
+
+
+
+static void serialize ( LoraConfig const & config , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( LoraConfig const & config )
+
+
+
+
+static kv_cache :: CommState deserializeCommState ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+kv_cache :: CommState const & state ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( kv_cache :: CommState const & state )
+
+
+
+
+static kv_cache :: SocketState deserializeSocketState ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+kv_cache :: SocketState const & state ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( kv_cache :: SocketState const & state )
+
+
+
+
+static kv_cache :: AgentState deserializeAgentState ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+kv_cache :: AgentState const & state ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( kv_cache :: AgentState const & state )
+
+
+
+
+static kv_cache :: CacheState deserializeCacheState ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+kv_cache :: CacheState const & state ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( kv_cache :: CacheState const & state )
+
+
+
+
+static DataTransceiverState deserializeDataTransceiverState (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static DataTransceiverState deserializeDataTransceiverState (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+DataTransceiverState const & dataTransceiverState ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static std :: vector < char > serialize (
+
+
+DataTransceiverState const & dataTransceiverState ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+DataTransceiverState const & dataTransceiverState ,
+
+
+)
+
+
+
+
+static ContextPhaseParams deserializeContextPhaseParams (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+ContextPhaseParams const & contextPhaseParams ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+ContextPhaseParams const & contextPhaseParams ,
+
+
+)
+
+
+
+
+static Request deserializeRequest ( std :: istream & is )
+
+
+
+
+static void serialize ( Request const & request , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( Request const & request )
+
+
+
+
+static Tensor deserializeTensor ( std :: istream & is )
+
+
+
+
+static void serialize ( Tensor const & tensor , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( Tensor const & tensor )
+
+
+
+
+static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+SpeculativeDecodingFastLogitsInfo const & info ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+SpeculativeDecodingFastLogitsInfo const & info ,
+
+
+)
+
+
+
+
+static Result deserializeResult ( std :: istream & is )
+
+
+
+
+static void serialize ( Result const & result , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( Result const & result )
+
+
+
+
+static AdditionalOutput deserializeAdditionalOutput ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+AdditionalOutput const & additionalOutput ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+AdditionalOutput const & additionalOutput ,
+
+
+)
+
+
+
+
+static Response deserializeResponse ( std :: istream & is )
+
+
+
+
+static void serialize ( Response const & response , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( Response const & response )
+
+
+
+
+static std :: vector < Response > deserializeResponses (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static std :: vector < char > serialize (
+
+
+std :: vector < Response > const & responses ,
+
+
+)
+
+
+
+
+static KvCacheConfig deserializeKvCacheConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+KvCacheConfig const & kvCacheConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( KvCacheConfig const & kvCacheConfig )
+
+
+
+
+static DynamicBatchConfig deserializeDynamicBatchConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+DynamicBatchConfig const & dynamicBatchConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+DynamicBatchConfig const & dynamicBatchConfig ,
+
+
+)
+
+
+
+
+static SchedulerConfig deserializeSchedulerConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+SchedulerConfig const & schedulerConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( SchedulerConfig const & schedulerConfig )
+
+
+
+
+static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+ExtendedRuntimePerfKnobConfig const & extendedRuntimePerfKnobConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+ExtendedRuntimePerfKnobConfig const & extendedRuntimePerfKnobConfig ,
+
+
+)
+
+
+
+
+static ParallelConfig deserializeParallelConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+ParallelConfig const & parallelConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( ParallelConfig const & parallelConfig )
+
+
+
+
+static PeftCacheConfig deserializePeftCacheConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+PeftCacheConfig const & peftCacheConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( PeftCacheConfig const & peftCacheConfig )
+
+
+
+
+static OrchestratorConfig deserializeOrchestratorConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+OrchestratorConfig const & orchestratorConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+OrchestratorConfig const & orchestratorConfig ,
+
+
+)
+
+
+
+
+static DecodingMode deserializeDecodingMode ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+DecodingMode const & decodingMode ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( DecodingMode const & decodingMode )
+
+
+
+
+static LookaheadDecodingConfig deserializeLookaheadDecodingConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+LookaheadDecodingConfig const & lookaheadDecodingConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+LookaheadDecodingConfig const & lookaheadDecodingConfig ,
+
+
+)
+
+
+
+
+static EagleConfig deserializeEagleConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+EagleConfig const & eagleConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( EagleConfig const & eagleConfig )
+
+
+
+
+static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+SpeculativeDecodingConfig const & specDecConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+SpeculativeDecodingConfig const & specDecConfig ,
+
+
+)
+
+
+
+
+static GuidedDecodingConfig deserializeGuidedDecodingConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+GuidedDecodingConfig const & guidedDecodingConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+GuidedDecodingConfig const & guidedDecodingConfig ,
+
+
+)
+
+
+
+
+static GuidedDecodingParams deserializeGuidedDecodingParams (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+GuidedDecodingParams const & guidedDecodingParams ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+GuidedDecodingParams const & guidedDecodingParams ,
+
+
+)
+
+
+
+
+static KvCacheRetentionConfig deserializeKvCacheRetentionConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+KvCacheRetentionConfig const & kvCacheRetentionConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+KvCacheRetentionConfig const & kvCacheRetentionConfig ,
+
+
+)
+
+
+
+
+static KvCacheRetentionConfig :: TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+KvCacheRetentionConfig :: TokenRangeRetentionConfig const & tokenRangeRetentionConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+KvCacheRetentionConfig :: TokenRangeRetentionConfig const & tokenRangeRetentionConfig ,
+
+
+)
+
+
+
+
+static DecodingConfig deserializeDecodingConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+DecodingConfig const & decodingConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( DecodingConfig const & decodingConfig )
+
+
+
+
+static DebugConfig deserializeDebugConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+DebugConfig const & debugConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( DebugConfig const & debugConfig )
+
+
+
+
+static CacheTransceiverConfig deserializeCacheTransceiverConfig (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+CacheTransceiverConfig const & cacheTransceiverConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+CacheTransceiverConfig const & cacheTransceiverConfig ,
+
+
+)
+
+
+
+
+static ExecutorConfig deserializeExecutorConfig ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+ExecutorConfig const & executorConfig ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( ExecutorConfig const & executorConfig )
+
+
+
+
+static KvCacheStats deserializeKvCacheStats ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+KvCacheStats const & kvCacheStats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( KvCacheStats const & kvCacheStats )
+
+
+
+
+static StaticBatchingStats deserializeStaticBatchingStats (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+StaticBatchingStats const & staticBatchingStats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+StaticBatchingStats const & staticBatchingStats ,
+
+
+)
+
+
+
+
+static InflightBatchingStats deserializeInflightBatchingStats (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+InflightBatchingStats const & inflightBatchingStats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+InflightBatchingStats const & inflightBatchingStats ,
+
+
+)
+
+
+
+
+static SpecDecodingStats deserializeSpecDecodingStats (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+SpecDecodingStats const & specDecodingStats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+SpecDecodingStats const & specDecodingStats ,
+
+
+)
+
+
+
+
+static IterationStats deserializeIterationStats (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static IterationStats deserializeIterationStats ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+IterationStats const & iterStats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static std :: vector < char > serialize ( IterationStats const & iterStats )
+
+
+
+
+static size_t serializedSize ( IterationStats const & iterStats )
+
+
+
+
+static std :: vector < char > serialize (
+
+
+std :: vector < IterationStats > const & iterStatsVec ,
+
+
+)
+
+
+
+
+static std :: vector < IterationStats > deserializeIterationStatsVec (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static DisServingRequestStats deserializeDisServingRequestStats (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+DisServingRequestStats const & stats ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize (
+
+
+DisServingRequestStats const & disServingRequestStats ,
+
+
+)
+
+
+
+
+static RequestStage deserializeRequestStage ( std :: istream & is )
+
+
+
+
+static void serialize (
+
+
+RequestStage const & requestStage ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( RequestStage const & requestStage )
+
+
+
+
+static RequestStats deserializeRequestStats ( std :: istream & is )
+
+
+
+
+static void serialize ( RequestStats const & state , std :: ostream & os )
+
+
+
+
+static size_t serializedSize ( RequestStats const & state )
+
+
+
+
+static RequestStatsPerIteration deserializeRequestStatsPerIteration (
+
+
+std :: istream & is ,
+
+
+)
+
+
+
+
+static RequestStatsPerIteration deserializeRequestStatsPerIteration (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static void serialize (
+
+
+RequestStatsPerIteration const & state ,
+std :: ostream & os ,
+
+
+)
+
+
+
+
+static std :: vector < char > serialize (
+
+
+RequestStatsPerIteration const & state ,
+
+
+)
+
+
+
+
+static size_t serializedSize ( RequestStatsPerIteration const & state )
+
+
+
+
+static std :: vector < char > serialize (
+
+
+std :: vector < RequestStatsPerIteration > const & requestStatsVec ,
+
+
+)
+
+
+
+
+static std :: vector < RequestStatsPerIteration > deserializeRequestStatsPerIterationVec (
+
+
+std :: vector < char > & buffer ,
+
+
+)
+
+
+
+
+static std :: string deserializeString ( std :: istream & is )
+
+
+
+
+static bool deserializeBool ( std :: istream & is )
+
+
+
+
+static ModelType deserializeModelType ( std :: istream & is )
+
+
+
+
+
+
+
+namespace kv_cache
+
+
+
+
+
+
types.h
@@ -2723,849 +4992,6 @@
-
-
-disaggServerUtil.h
-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-namespace disagg_executor
-
-
-class DisaggExecutorOrchestrator
-
-
Public Functions
-
-
-DisaggExecutorOrchestrator (
-
-
-std :: vector < std :: filesystem :: path > const & ctxEnginePaths ,
-std :: vector < std :: filesystem :: path > const & genEnginePaths ,
-std :: vector < executor :: ExecutorConfig > const & ctxExecutorConfigs ,
-std :: vector < executor :: ExecutorConfig > const & genExecutorConfigs ,
-bool hasContextAwaitThreads ,
-bool hasGenAwaitThreads ,
-
-
-)
-Constructs a DisaggExecutorOrchestrator object.
-
-Parameters:
-
-ctxEnginePaths – A vector of file paths to context engine files.
-genEnginePaths – A vector of file paths to generation engine files.
-ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
-genExecutorConfigs – A vector of ExecutorConfig for generation executors.
-hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
-hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
-
-
-
-
-
-
-
-std :: vector < IdType > enqueueContext (
-
-
-std :: vector < texec :: Request > const & requests ,
-std :: optional < int > selectContextId = std :: nullopt ,
-bool batch = false ,
-
-
-)
-Enqueue context-only requests to context executors.
-
-Parameters:
-
-requests – A vector of context-only requests.
-selectContextId – The index of the context executor to use. If std::nullopt , the executor that has the smallest number of inflight requests will be used.
-batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
-
-
-Returns:
-A vector of global request ids, corresponding to the order of the requests in requests , the id returned may be different from the request id in each executor.
-
-
-
-
-
-
-void enqueueGeneration (
-
-
-std :: vector < texec :: Request > const & requests ,
-std :: vector < IdType > const & globalRequestIds ,
-std :: optional < int > selectGenIdx = std :: nullopt ,
-bool batch = false ,
-
-
-)
-Enqueue generation-only requests to generation executors.
-
-Parameters:
-
-requests – A vector of generation-only requests.
-globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
-selectGenIdx – The index of the generation executor to use. If std::nullopt , the executor that has the smallest number of inflight requests will be used.
-batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
-
-
-
-
-
-
-
-std :: vector < ResponseWithId > awaitContextResponses (
-
-
-std :: optional < std :: chrono :: milliseconds > const & timeout ,
-std :: optional < int > contextIdx = std :: nullopt ,
-
-
-)
-Await for context responses.
-
-Parameters:
-
-timeout – The maximum time to wait for new responses
-contextIdx – The index of the context executor to use. If std::nullopt , return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
-
-
-Returns:
-A vector of responses with corresponding global request ids
-
-
-
-
-
-
-std :: vector < ResponseWithId > awaitGenerationResponses (
-
-
-std :: optional < std :: chrono :: milliseconds > const & timeout ,
-std :: optional < int > genIdx = std :: nullopt ,
-
-
-)
-Await for generation responses.
-
-Parameters:
-
-timeout – The maximum time to wait for new responses.
-genIdx – The index of the generation executor to use. If std::nullopt , return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
-
-
-Returns:
-A vector of responses with corresponding global request ids.
-
-
-
-
-
-
-bool canEnqueue ( ) const
-Indicates if the current process is allowed to enqueueRequests.
-
-
-
-
-std :: vector < std :: unique_ptr < texec :: Executor > > const & getContextExecutors (
-
-
-
-
-) const
-Get context executors.
-
-
-
-
-std :: vector < std :: unique_ptr < texec :: Executor > > const & getGenExecutors (
-
-
-
-
-) const
-Get generation executors.
-
-
-
-
-~DisaggExecutorOrchestrator ( )
-
-
-
-
-
Private Members
-
-
-std :: unique_ptr < Impl > mImpl
-
-
-
-
-
-
-
-struct ResponseWithId
-
-
-
-
-
-
-
-
-
-
-
-
-dataTransceiverState.h
-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-class DataTransceiverState
-
-
Public Functions
-
-
-DataTransceiverState ( ) = default
-
-
-
-
-inline DataTransceiverState (
-
-
-kv_cache :: CacheState cacheState ,
-kv_cache :: CommState commState ,
-
-
-)
-
-
-
-
-inline void setCacheState ( kv_cache :: CacheState state )
-
-
-
-
-inline std :: optional < kv_cache :: CacheState > const & getCacheState (
-
-
-
-
-) const noexcept
-
-
-
-
-inline void setCommState ( kv_cache :: CommState state )
-
-
-
-
-inline std :: optional < kv_cache :: CommState > const & getCommState (
-
-
-
-
-) const noexcept
-
-
-
-
-inline bool operator == (
-
-
-DataTransceiverState const & other ,
-
-
-) const noexcept
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
-
Friends
-
-
-friend class Serialization
-
-
-
-
-
-
-
-namespace kv_cache
-
-
-struct AgentState
-
-
Public Functions
-
-
-inline AgentState ( std :: string agentName , std :: string connectionInfo )
-
-
-
-
-AgentState ( ) = default
-
-
-
-
-inline bool operator == ( AgentState const & other ) const noexcept
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
Public Members
-
-
-std :: string mAgentName
-
-
-
-
-std :: string mConnectionInfo
-
-
-
-
-
-
-
-class CacheState
-
-
Public Types
-
-
-enum class AttentionType : std :: uint8_t
-Values:
-
-
-enumerator kDEFAULT
-
-
-
-
-enumerator kMLA
-
-
-
-
-
-
-
Public Functions
-
-
-inline CacheState (
-
-
-ModelConfig modelConfig ,
-runtime :: WorldConfig const & worldConfig ,
-nvinfer1 :: DataType dataType ,
-AttentionType attentionType = AttentionType :: kDEFAULT ,
-int kvFactor = 2 ,
-
-
-)
-
-
-
-
-inline CacheState (
-
-
-std :: vector < SizeType32 > nbKvHeadPerLayer ,
-SizeType32 sizePerHead ,
-SizeType32 tokensPerBlock ,
-SizeType32 tensorParallelism ,
-SizeType32 pipelineParallelism ,
-nvinfer1 :: DataType dataType ,
-AttentionType attentionType = AttentionType :: kDEFAULT ,
-int kvFactor = 2 ,
-bool enableAttentionDP = false ,
-int DPrank = 0 ,
-int DPsize = 0 ,
-
-
-)
-
-
-
-
-inline CacheState (
-
-
-SizeType32 nbAttentionLayers ,
-SizeType32 nbKvHeads ,
-SizeType32 sizePerHead ,
-SizeType32 tokensPerBlock ,
-SizeType32 tensorParallelism ,
-SizeType32 pipelineParallelism ,
-nvinfer1 :: DataType dataType ,
-AttentionType attentionType = AttentionType :: kDEFAULT ,
-int kvFactor = 2 ,
-bool enableAttentionDP = false ,
-int DPrank = 0 ,
-int DPsize = 0 ,
-
-
-)
-
-
-
-
-inline bool operator == (
-
-
-kv_cache :: CacheState const & other ,
-
-
-) const noexcept
-
-
-
-
-inline ModelConfig const & getModelConfig ( ) const
-
-
-
-
-inline ParallelConfig const & getParallelConfig ( ) const
-
-
-
-
-inline AttentionConfig const & getAttentionConfig ( ) const
-
-
-
-
-inline nvinfer1 :: DataType const & getDataType ( ) const
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
-
Friends
-
-
-friend class tensorrt_llm::executor::Serialization
-
-
-
-
-
-struct AttentionConfig
-
-
Public Functions
-
-
-inline AttentionConfig ( AttentionType attentionType , int kvFactor )
-
-
-
-
-
Public Members
-
-
-AttentionType mAttentionType
-
-
-
-
-int mKvFactor
-
-
-
-
-
-
-
-struct ModelConfig
-
-
Public Functions
-
-
-inline bool operator == ( ModelConfig const & other ) const noexcept
-
-
-
-
-
-
-
-
-struct ParallelConfig
-
-
Public Functions
-
-
-inline bool operator == ( ParallelConfig const & other ) const noexcept
-
-
-
-
-
-
-
-
-
-
-class CommState
-
-
Public Functions
-
-
-CommState ( ) = default
-
-
-
-
-inline explicit CommState (
-
-
-std :: vector < SizeType32 > ranks ,
-int selfIdx = - 1 ,
-
-
-)
-
-
-
-
-inline explicit CommState (
-
-
-std :: vector < SocketState > socketState ,
-int selfIdx = - 1 ,
-
-
-)
-
-
-
-
-inline CommState ( std :: uint16_t port , std :: string ip )
-
-
-
-
-inline explicit CommState (
-
-
-std :: vector < AgentState > agentState ,
-int selfIdx = - 1 ,
-
-
-)
-
-
-
-
-inline bool isMpiState ( ) const noexcept
-
-
-
-
-inline bool isSocketState ( ) const noexcept
-
-
-
-
-inline bool isAgentState ( ) const noexcept
-
-
-
-
-inline MpiState const & getMpiState ( ) const
-
-
-
-
-inline std :: vector < SocketState > const & getSocketState ( ) const
-
-
-
-
-inline std :: vector < AgentState > const & getAgentState ( ) const
-
-
-
-
-inline int getSelfIdx ( ) const noexcept
-
-
-
-
-inline bool operator == ( CommState const & other ) const noexcept
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
Private Members
-
-
-std :: variant < std :: monostate , MpiState , std :: vector < SocketState > , std :: vector < AgentState > > mState
-
-
-
-
-int mSelfIdx = { - 1 }
-
-
-
-
-
Friends
-
-
-friend class tensorrt_llm::executor::Serialization
-
-
-
-
-
-
-
-struct MpiState
-
-
Public Functions
-
-
-inline bool operator == ( MpiState const & other ) const noexcept
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
Public Members
-
-
-std :: vector < SizeType32 > mRanks
-
-
-
-
-
-
-
-struct SocketState
-
-
Public Functions
-
-
-inline bool operator == ( SocketState const & other ) const noexcept
-
-
-
-
-inline std :: string toString ( ) const
-
-
-
-
-
Public Members
-
-
-std :: uint16_t mPort
-
-
-
-
-std :: string mIp
-
-
-
-
-
-
-
-
-
-
-
executor.h
@@ -9113,8 +10539,8 @@
-
-serialization.h
+
+dataTransceiverState.h
namespace tensorrt_llm
@@ -9122,1448 +10548,570 @@
namespace executor
-
-class Serialization
+
+class DataTransceiverState
-
Public Static Functions
+
Public Functions
-
-static RequestPerfMetrics :: TimePoint deserializeTimePoint (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-RequestPerfMetrics :: TimePoint const & tp ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( RequestPerfMetrics :: TimePoint const & )
-
-
-
-
-static RequestPerfMetrics deserializeRequestPerfMetrics (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-RequestPerfMetrics const & metrics ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( RequestPerfMetrics const & metrics )
-
-
-
-
-static SamplingConfig deserializeSamplingConfig ( std :: istream & is )
-
-
-
-
-static void serialize ( SamplingConfig const & config , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( SamplingConfig const & config )
-
-
-
-
-static OutputConfig deserializeOutputConfig ( std :: istream & is )
-
-
-
-
-static void serialize ( OutputConfig const & config , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( OutputConfig const & config )
-
-
-
-
-static AdditionalModelOutput deserializeAdditionalModelOutput (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-AdditionalModelOutput const & additionalModelOutput ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-AdditionalModelOutput const & additionalModelOutput ,
-
-
-)
-
-
-
-
-static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-ExternalDraftTokensConfig const & config ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( ExternalDraftTokensConfig const & config )
-
-
-
-
-static PromptTuningConfig deserializePromptTuningConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-PromptTuningConfig const & config ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( PromptTuningConfig const & config )
-
-
-
-
-static MultimodalInput deserializeMultimodalInput ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-MultimodalInput const & multimodalInput ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( MultimodalInput const & multimodalInput )
-
-
-
-
-static MropeConfig deserializeMropeConfig ( std :: istream & is )
-
-
-
-
-static void serialize ( MropeConfig const & config , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( MropeConfig const & config )
-
-
-
-
-static LoraConfig deserializeLoraConfig ( std :: istream & is )
-
-
-
-
-static void serialize ( LoraConfig const & config , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( LoraConfig const & config )
-
-
-
-
-static kv_cache :: CommState deserializeCommState ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-kv_cache :: CommState const & state ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( kv_cache :: CommState const & state )
-
-
-
-
-static kv_cache :: SocketState deserializeSocketState ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-kv_cache :: SocketState const & state ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( kv_cache :: SocketState const & state )
-
-
-
-
-static kv_cache :: AgentState deserializeAgentState ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-kv_cache :: AgentState const & state ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( kv_cache :: AgentState const & state )
-
-
-
-
-static kv_cache :: CacheState deserializeCacheState ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-kv_cache :: CacheState const & state ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( kv_cache :: CacheState const & state )
-
-
-
-
-static DataTransceiverState deserializeDataTransceiverState (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static DataTransceiverState deserializeDataTransceiverState (
-
-
-std :: vector < char > & buffer ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-DataTransceiverState const & dataTransceiverState ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static std :: vector < char > serialize (
-
-
-DataTransceiverState const & dataTransceiverState ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-DataTransceiverState const & dataTransceiverState ,
-
-
-)
-
-
-
-
-static ContextPhaseParams deserializeContextPhaseParams (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-ContextPhaseParams const & contextPhaseParams ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-ContextPhaseParams const & contextPhaseParams ,
-
-
-)
-
-
-
-
-static Request deserializeRequest ( std :: istream & is )
-
-
-
-
-static void serialize ( Request const & request , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( Request const & request )
-
-
-
-
-static Tensor deserializeTensor ( std :: istream & is )
-
-
-
-
-static void serialize ( Tensor const & tensor , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( Tensor const & tensor )
-
-
-
-
-static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-SpeculativeDecodingFastLogitsInfo const & info ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-SpeculativeDecodingFastLogitsInfo const & info ,
-
-
-)
-
-
-
-
-static Result deserializeResult ( std :: istream & is )
-
-
-
-
-static void serialize ( Result const & result , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( Result const & result )
-
-
-
-
-static AdditionalOutput deserializeAdditionalOutput ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-AdditionalOutput const & additionalOutput ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-AdditionalOutput const & additionalOutput ,
-
-
-)
-
-
-
-
-static Response deserializeResponse ( std :: istream & is )
-
-
-
-
-static void serialize ( Response const & response , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( Response const & response )
-
-
-
-
-static std :: vector < Response > deserializeResponses (
-
-
-std :: vector < char > & buffer ,
-
-
-)
-
-
-
-
-static std :: vector < char > serialize (
-
-
-std :: vector < Response > const & responses ,
-
-
-)
-
-
-
-
-static KvCacheConfig deserializeKvCacheConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-KvCacheConfig const & kvCacheConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( KvCacheConfig const & kvCacheConfig )
-
-
-
-
-static DynamicBatchConfig deserializeDynamicBatchConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-DynamicBatchConfig const & dynamicBatchConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-DynamicBatchConfig const & dynamicBatchConfig ,
-
-
-)
-
-
-
-
-static SchedulerConfig deserializeSchedulerConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-SchedulerConfig const & schedulerConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( SchedulerConfig const & schedulerConfig )
-
-
-
-
-static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-ExtendedRuntimePerfKnobConfig const & extendedRuntimePerfKnobConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-ExtendedRuntimePerfKnobConfig const & extendedRuntimePerfKnobConfig ,
-
-
-)
-
-
-
-
-static ParallelConfig deserializeParallelConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-ParallelConfig const & parallelConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( ParallelConfig const & parallelConfig )
-
-
-
-
-static PeftCacheConfig deserializePeftCacheConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-PeftCacheConfig const & peftCacheConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( PeftCacheConfig const & peftCacheConfig )
-
-
-
-
-static OrchestratorConfig deserializeOrchestratorConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-OrchestratorConfig const & orchestratorConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-OrchestratorConfig const & orchestratorConfig ,
-
-
-)
-
-
-
-
-static DecodingMode deserializeDecodingMode ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-DecodingMode const & decodingMode ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( DecodingMode const & decodingMode )
-
-
-
-
-static LookaheadDecodingConfig deserializeLookaheadDecodingConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-LookaheadDecodingConfig const & lookaheadDecodingConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-LookaheadDecodingConfig const & lookaheadDecodingConfig ,
-
-
-)
-
-
-
-
-static EagleConfig deserializeEagleConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-EagleConfig const & eagleConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( EagleConfig const & eagleConfig )
-
-
-
-
-static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-SpeculativeDecodingConfig const & specDecConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-SpeculativeDecodingConfig const & specDecConfig ,
-
-
-)
-
-
-
-
-static GuidedDecodingConfig deserializeGuidedDecodingConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-GuidedDecodingConfig const & guidedDecodingConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-GuidedDecodingConfig const & guidedDecodingConfig ,
-
-
-)
-
-
-
-
-static GuidedDecodingParams deserializeGuidedDecodingParams (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-GuidedDecodingParams const & guidedDecodingParams ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-GuidedDecodingParams const & guidedDecodingParams ,
-
-
-)
-
-
-
-
-static KvCacheRetentionConfig deserializeKvCacheRetentionConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-KvCacheRetentionConfig const & kvCacheRetentionConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-KvCacheRetentionConfig const & kvCacheRetentionConfig ,
-
-
-)
-
-
-
-
-static KvCacheRetentionConfig :: TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-KvCacheRetentionConfig :: TokenRangeRetentionConfig const & tokenRangeRetentionConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-KvCacheRetentionConfig :: TokenRangeRetentionConfig const & tokenRangeRetentionConfig ,
-
-
-)
-
-
-
-
-static DecodingConfig deserializeDecodingConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-DecodingConfig const & decodingConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( DecodingConfig const & decodingConfig )
-
-
-
-
-static DebugConfig deserializeDebugConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-DebugConfig const & debugConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( DebugConfig const & debugConfig )
-
-
-
-
-static CacheTransceiverConfig deserializeCacheTransceiverConfig (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-CacheTransceiverConfig const & cacheTransceiverConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-CacheTransceiverConfig const & cacheTransceiverConfig ,
-
-
-)
-
-
-
-
-static ExecutorConfig deserializeExecutorConfig ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-ExecutorConfig const & executorConfig ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( ExecutorConfig const & executorConfig )
-
-
-
-
-static KvCacheStats deserializeKvCacheStats ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-KvCacheStats const & kvCacheStats ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize ( KvCacheStats const & kvCacheStats )
-
-
-
-
-static StaticBatchingStats deserializeStaticBatchingStats (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-StaticBatchingStats const & staticBatchingStats ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-StaticBatchingStats const & staticBatchingStats ,
-
-
-)
-
-
-
-
-static InflightBatchingStats deserializeInflightBatchingStats (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-InflightBatchingStats const & inflightBatchingStats ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-InflightBatchingStats const & inflightBatchingStats ,
-
-
-)
-
-
-
-
-static SpecDecodingStats deserializeSpecDecodingStats (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-SpecDecodingStats const & specDecodingStats ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static size_t serializedSize (
-
-
-SpecDecodingStats const & specDecodingStats ,
-
-
-)
-
-
-
-
-static IterationStats deserializeIterationStats (
-
-
-std :: vector < char > & buffer ,
-
-
-)
-
-
-
-
-static IterationStats deserializeIterationStats ( std :: istream & is )
-
-
-
-
-static void serialize (
-
-
-IterationStats const & iterStats ,
-std :: ostream & os ,
-
-
-)
-
-
-
-
-static std :: vector < char > serialize ( IterationStats const & iterStats )
-
-
-
-
-static size_t serializedSize ( IterationStats const & iterStats )
-
-
-
-
-static std :: vector < char > serialize (
-
-
-std :: vector < IterationStats > const & iterStatsVec ,
-
-
-)
-
-
-
-
-static std :: vector < IterationStats > deserializeIterationStatsVec (
-
-
-std :: vector < char > & buffer ,
-
-
-)
-
-
-
-
-static DisServingRequestStats deserializeDisServingRequestStats (
-
-
-std :: istream & is ,
-
-
-)
-
-
-
-
-static void serialize (
-
-
-DisServingRequestStats const & stats ,
-std :: ostream & os ,
-
-
-)
+
+DataTransceiverState ( ) = default
-
-static size_t serializedSize (
+
+inline DataTransceiverState (
-DisServingRequestStats const & disServingRequestStats ,
+kv_cache :: CacheState cacheState ,
+kv_cache :: CommState commState ,
-)
+)
-
-static RequestStage deserializeRequestStage ( std :: istream & is )
+
+inline void setCacheState ( kv_cache :: CacheState state )
-
-static void serialize (
+
+inline std :: optional < kv_cache :: CacheState > const & getCacheState (
-RequestStage const & requestStage ,
-std :: ostream & os ,
-)
-
-
-
-
-static size_t serializedSize ( RequestStage const & requestStage )
-
-
-
-
-static RequestStats deserializeRequestStats ( std :: istream & is )
-
-
-
-
-static void serialize ( RequestStats const & state , std :: ostream & os )
-
-
-
-
-static size_t serializedSize ( RequestStats const & state )
+) const noexcept
-
-static RequestStatsPerIteration deserializeRequestStatsPerIteration (
-
-
-std :: istream & is ,
-
-
-)
+
+inline void setCommState ( kv_cache :: CommState state )
-
-static RequestStatsPerIteration deserializeRequestStatsPerIteration (
+
+inline std :: optional < kv_cache :: CommState > const & getCommState (
-std :: vector < char > & buffer ,
-)
+) const noexcept
-
-static void serialize (
+
+inline bool operator == (
-RequestStatsPerIteration const & state ,
-std :: ostream & os ,
+DataTransceiverState const & other ,
-)
+) const noexcept
-
-static std :: vector < char > serialize (
-
-
-RequestStatsPerIteration const & state ,
-
-
-)
+
+inline std :: string toString ( ) const
-
-
-static size_t serializedSize ( RequestStatsPerIteration const & state )
+
+
+
+
Friends
-std :: vector < char > & buffer ,
+
+friend class Serialization
-
)
-
-
-
-
-static std :: string deserializeString ( std :: istream & is )
-
-
-
-
-static bool deserializeBool ( std :: istream & is )
-
-
-
-
-static ModelType deserializeModelType ( std :: istream & is )
-
-
namespace kv_cache
+
+
+struct AgentState
+
+
Public Functions
+
+
+inline AgentState ( std :: string agentName , std :: string connectionInfo )
+
+
+AgentState ( ) = default
+
+
+
+
+inline bool operator == ( AgentState const & other ) const noexcept
+
+
+
+
+inline std :: string toString ( ) const
+
+
+
+
+
Public Members
+
+
+std :: string mAgentName
+
+
+
+
+std :: string mConnectionInfo
+
+
+
+
+
+
+
+class CacheState
+
+
Public Types
+
+
+enum class AttentionType : std :: uint8_t
+Values:
+
+
+enumerator kDEFAULT
+
+
+
+
+enumerator kMLA
+
+
+
+
+
+
+
Public Functions
+
+
+inline CacheState (
+
+
+ModelConfig modelConfig ,
+runtime :: WorldConfig const & worldConfig ,
+nvinfer1 :: DataType dataType ,
+AttentionType attentionType = AttentionType :: kDEFAULT ,
+int kvFactor = 2 ,
+
+
+)
+
+
+
+
+inline CacheState (
+
+
+std :: vector < SizeType32 > nbKvHeadPerLayer ,
+SizeType32 sizePerHead ,
+SizeType32 tokensPerBlock ,
+SizeType32 tensorParallelism ,
+SizeType32 pipelineParallelism ,
+nvinfer1 :: DataType dataType ,
+AttentionType attentionType = AttentionType :: kDEFAULT ,
+int kvFactor = 2 ,
+bool enableAttentionDP = false ,
+int DPrank = 0 ,
+int DPsize = 0 ,
+
+
+)
+
+
+
+
+inline CacheState (
+
+
+SizeType32 nbAttentionLayers ,
+SizeType32 nbKvHeads ,
+SizeType32 sizePerHead ,
+SizeType32 tokensPerBlock ,
+SizeType32 tensorParallelism ,
+SizeType32 pipelineParallelism ,
+nvinfer1 :: DataType dataType ,
+AttentionType attentionType = AttentionType :: kDEFAULT ,
+int kvFactor = 2 ,
+bool enableAttentionDP = false ,
+int DPrank = 0 ,
+int DPsize = 0 ,
+
+
+)
+
+
+
+
+inline bool operator == (
+
+
+kv_cache :: CacheState const & other ,
+
+
+) const noexcept
+
+
+
+
+inline ModelConfig const & getModelConfig ( ) const
+
+
+
+
+inline ParallelConfig const & getParallelConfig ( ) const
+
+
+
+
+inline AttentionConfig const & getAttentionConfig ( ) const
+
+
+
+
+inline nvinfer1 :: DataType const & getDataType ( ) const
+
+
+
+
+inline std :: string toString ( ) const
+
+
+
+
+
+
Friends
+
+
+friend class tensorrt_llm::executor::Serialization
+
+
+
+
+
+struct AttentionConfig
+
+
Public Functions
+
+
+inline AttentionConfig ( AttentionType attentionType , int kvFactor )
+
+
+
+
+
Public Members
+
+
+AttentionType mAttentionType
+
+
+
+
+int mKvFactor
+
+
+
+
+
+
+
+struct ModelConfig
+
+
Public Functions
+
+
+inline bool operator == ( ModelConfig const & other ) const noexcept
+
+
+
+
+
+
+
+
+struct ParallelConfig
+
+
Public Functions
+
+
+inline bool operator == ( ParallelConfig const & other ) const noexcept
+
+
+
+
+
+
+
+
+
+
+class CommState
+
+
Public Functions
+
+
+CommState ( ) = default
+
+
+
+
+inline explicit CommState (
+
+
+std :: vector < SizeType32 > ranks ,
+int selfIdx = - 1 ,
+
+
+)
+
+
+
+
+inline explicit CommState (
+
+
+std :: vector < SocketState > socketState ,
+int selfIdx = - 1 ,
+
+
+)
+
+
+
+
+inline CommState ( std :: uint16_t port , std :: string ip )
+
+
+
+
+inline explicit CommState (
+
+
+std :: vector < AgentState > agentState ,
+int selfIdx = - 1 ,
+
+
+)
+
+
+
+
+inline bool isMpiState ( ) const noexcept
+
+
+
+
+inline bool isSocketState ( ) const noexcept
+
+
+
+
+inline bool isAgentState ( ) const noexcept
+
+
+
+
+inline MpiState const & getMpiState ( ) const
+
+
+
+
+inline std :: vector < SocketState > const & getSocketState ( ) const
+
+
+
+
+inline std :: vector < AgentState > const & getAgentState ( ) const
+
+
+
+
+inline int getSelfIdx ( ) const noexcept
+
+
+
+
+inline bool operator == ( CommState const & other ) const noexcept
+
+
+
+
+inline std :: string toString ( ) const
+
+
+
+
+
Private Members
+
+
+std :: variant < std :: monostate , MpiState , std :: vector < SocketState > , std :: vector < AgentState > > mState
+
+
+
+
+int mSelfIdx = { - 1 }
+
+
+
+
+
Friends
+
+
+friend class tensorrt_llm::executor::Serialization
+
+
+
+
+
+
+
+struct MpiState
+
+
Public Functions
+
+
+inline bool operator == ( MpiState const & other ) const noexcept
+
+
+
+
+inline std :: string toString ( ) const
+
+
+
+
+
Public Members
+
+
+std :: vector < SizeType32 > mRanks
+
+
+
+
+
+
+
+struct SocketState
+
+
Public Functions
+
+
+inline bool operator == ( SocketState const & other ) const noexcept
+
+
+
+
+inline std :: string toString ( ) const
+
+
+
+
+
Public Members
+
+
+std :: uint16_t mPort
+
+
+
+
+std :: string mIp
+
+
+
+
+
+
+
@@ -10698,553 +11246,6 @@
-
-
-transferAgent.h
-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-namespace kv_cache
-
-
Typedefs
-
-
-using TransferDescs = MemoryDescs
-
-
-
-
-using RegisterDescs = MemoryDescs
-
-
-
-
-using SyncMessage = std :: string
-
-
-
-
-using ConnectionInfoType = std :: string
-
-
-
-
-
Enums
-
-
-enum class MemoryType : uint8_t
-Values:
-
-
-enumerator kDRAM
-
-
-
-
-enumerator kVRAM
-
-
-
-
-enumerator kBLK
-
-
-
-
-enumerator kOBJ
-
-
-
-
-enumerator kFILE
-
-
-
-
-
-
-enum class TransferOp : uint8_t
-Values:
-
-
-enumerator kREAD
-
-
-
-
-enumerator kWRITE
-
-
-
-
-
-
-
Functions
-
-
-template < typename ... Args > std :: unique_ptr < BaseTransferAgent > makeTransferAgent (
-
-
-std :: string const & backend ,
-Args & & ... args ,
-
-
-)
-
-
-
-
-
-class AgentDesc
-
-
Public Functions
-
-
-inline AgentDesc ( std :: string backendAgentDesc )
-
-
-
-
-inline std :: string const & getBackendAgentDesc ( ) const noexcept
-
-
-
-
-
Private Members
-
-
-std :: string mBackendAgentDesc
-
-
-
-
-
-
-
-struct BaseAgentConfig
-
-
Public Members
-
-
-std :: string mName
-
-
-
-
-bool useProgThread
-
-
-
-
-
-
-
-class BaseTransferAgent
-
-
Public Functions
-
-
-virtual ~BaseTransferAgent ( ) = default
-
-
-
-
-virtual void registerMemory ( RegisterDescs const & descs ) = 0
-
-
-
-
-virtual void deregisterMemory ( RegisterDescs const & descs ) = 0
-
-
-
-
-virtual void loadRemoteAgent (
-
-
-std :: string const & name ,
-AgentDesc const & agentDesc ,
-
-
-) = 0
-
-
-
-
-virtual AgentDesc getLocalAgentDesc ( ) = 0
-
-
-
-
-virtual void invalidateRemoteAgent ( std :: string const & name ) = 0
-
-
-
-
-virtual std :: unique_ptr < TransferStatus > submitTransferRequests (
-
-
-TransferRequest const & request ,
-
-
-) = 0
-
-
-
-
-virtual void notifySyncMessage (
-
-
-std :: string const & name ,
-SyncMessage const & syncMessage ,
-
-
-) = 0
-
-
-
-
-virtual std :: unordered_map < std :: string , std :: vector < SyncMessage > > getNotifiedSyncMessages (
-
-
-
-
-) = 0
-
-
-
-
-virtual ConnectionInfoType getConnectionInfo ( ) = 0
-
-
-
-
-virtual void connectRemoteAgent (
-
-
-std :: string const & name ,
-ConnectionInfoType const & connectionInfo ,
-
-
-) = 0
-
-
-
-
-virtual bool checkRemoteDescs (
-
-
-std :: string const & name ,
-MemoryDescs const & memoryDescs ,
-
-
-) = 0
-
-
-
-
-
-
-
-class DynLibLoader
-
-
Public Functions
-
-
-void * getHandle ( std :: string const & name )
-
-
-
-
-template < typename FunctionT > inline FunctionT getFunctionPointer (
-
-
-std :: string const & libName ,
-std :: string const & funcName ,
-
-
-)
-
-
-
-
-~DynLibLoader ( )
-
-
-
-
-DynLibLoader ( ) = default
-
-
-
-
-DynLibLoader ( DynLibLoader const & ) = delete
-
-
-
-
-DynLibLoader & operator = ( DynLibLoader const & ) = delete
-
-
-
-
-
Public Static Functions
-
-
-static DynLibLoader & getInstance ( )
-
-
-
-
-
Private Members
-
-
-std :: mutex mDllMutex
-
-
-
-
-std :: unordered_map < std :: string , void * > mHandlers
-
-
-
-
-
Private Static Functions
-
-
-static void * dlSym ( void * handle , char const * symbol )
-
-
-
-
-
-
-
-class MemoryDesc
-
-
Public Functions
-
-
-inline MemoryDesc (
-
-
-std :: vector < char > const & vec ,
-uint32_t deviceId = 0 ,
-
-
-)
-
-
-
-
-inline MemoryDesc ( void * addr , size_t len , uint32_t deviceId )
-
-
-
-
-inline MemoryDesc ( uintptr_t addr , size_t len , uint32_t deviceId )
-
-
-
-
-inline uintptr_t getAddr ( ) const noexcept
-
-
-
-
-inline size_t getLen ( ) const noexcept
-
-
-
-
-inline uint32_t getDeviceId ( ) const noexcept
-
-
-
-
-
Public Static Functions
-
-
-static void serialize ( MemoryDesc const & memoryDesc , std :: ostream & os )
-
-
-
-
-static MemoryDesc deserialize ( std :: istream & is )
-
-
-
-
-static size_t serializedSize ( MemoryDesc const & memoryDesc )
-
-
-
-
-
Private Members
-
-
-uintptr_t mAddr
-
-
-
-
-size_t mLen
-
-
-
-
-uint32_t mDeviceId
-
-
-
-
-
-
-
-class MemoryDescs
-
-
Public Functions
-
-
-inline MemoryDescs ( MemoryType type , std :: vector < MemoryDesc > descs )
-
-
-
-
-inline MemoryType getType ( ) const noexcept
-
-
-
-
-inline std :: vector < MemoryDesc > const & getDescs ( ) const noexcept
-
-
-
-
-
-
-
-
-class TransferRequest
-
-
Public Functions
-
-
-inline TransferRequest (
-
-
-TransferOp op ,
-TransferDescs srcDescs ,
-TransferDescs dstDescs ,
-std :: string const & remoteName ,
-std :: optional < SyncMessage > syncMessage = std :: nullopt ,
-
-
-)
-
-
-
-
-inline TransferOp getOp ( ) const noexcept
-
-
-
-
-inline TransferDescs const & getSrcDescs ( ) const noexcept
-
-
-
-
-inline TransferDescs const & getDstDescs ( ) const noexcept
-
-
-
-
-inline std :: string const & getRemoteName ( ) const noexcept
-
-
-
-
-inline std :: optional < SyncMessage > getSyncMessage ( ) const noexcept
-
-
-
-
-
-
-
-
-class TransferStatus
-
-
Public Functions
-
-
-virtual ~TransferStatus ( ) = default
-
-
-
-
-virtual bool isCompleted ( ) const = 0
-
-
-
-
-virtual void wait ( ) const = 0
-
-
-
-
-
-
-
-
-
-
-
@@ -11297,73 +11298,381 @@
@@ -13083,9 +13084,9 @@
diff --git a/latest/_cpp_gen/runtime.html b/latest/_cpp_gen/runtime.html
index f707aecbcc..9127c93bea 100644
--- a/latest/_cpp_gen/runtime.html
+++ b/latest/_cpp_gen/runtime.html
@@ -58,7 +58,7 @@
@@ -70,7 +70,7 @@
-
+
@@ -348,6 +348,7 @@
Generate text with guided decoding
Control generated text using logits processor
Generate text with multiple LoRA adapters
+Speculative Decoding
Run LLM-API with pytorch backend on Slurm
Run trtllm-bench with pytorch backend on Slurm
Run trtllm-serve with pytorch backend on Slurm
@@ -408,7 +409,7 @@
KV Cache Management: Pools, Blocks, and Events
KV cache reuse
Speculative Sampling
-Disaggregated-Service (experimental)
+Disaggregated-Service (Experimental)
Performance
@@ -496,8 +497,8 @@
Runtime
-
-gptJsonConfig.h
+
+lookaheadBuffers.h
namespace tensorrt_llm
@@ -505,4196 +506,232 @@
namespace runtime
-
-class GptJsonConfig
+
+class LookaheadDecodingBuffers
+
Public Functions
-
-inline GptJsonConfig (
+
+LookaheadDecodingBuffers (
-std :: string name ,
-std :: string version ,
-std :: string precision ,
-SizeType32 tensorParallelism ,
-SizeType32 pipelineParallelism ,
-SizeType32 contextParallelism ,
-SizeType32 gpusPerNode ,
-ModelConfig modelConfig ,
-std :: optional < RuntimeDefaults > runtimeDefaults = std :: nullopt ,
+SizeType32 maxNumSequences ,
+SizeType32 maxTokensPerStep ,
+BufferManager const & bufferManager ,
-)
-
-
-
-
-inline ModelConfig const & getModelConfig ( ) const
-
-
-
-
-inline ModelConfig & getModelConfigMutable ( )
-
-
-
-
-inline std :: string const & getName ( ) const
-
-
-
-
-inline std :: string const & getVersion ( ) const
-
-
-
-
-inline std :: string const & getPrecision ( ) const
-
-
-
-
-inline SizeType32 constexpr getTensorParallelism ( ) const
-
-
-
-
-inline SizeType32 constexpr getPipelineParallelism ( ) const
-
-
-
-
-inline SizeType32 constexpr getContextParallelism ( ) const
-
-
-
-
-inline SizeType32 constexpr getGpusPerNode ( ) const
-
-
-
-
-inline SizeType32 constexpr getWorldSize ( ) const
-
-
-
-
-inline std :: optional < RuntimeDefaults > getRuntimeDefaults ( ) const
-
-
-
-
-std :: string engineFilename (
-
-
-WorldConfig const & worldConfig ,
-std :: string const & model ,
-
-
-) const
-
-
-
-
-inline std :: string engineFilename (
-
-
-WorldConfig const & worldConfig ,
-
-
-) const
-
-
-
-
-
Public Static Functions
-
-
-static GptJsonConfig parse ( std :: string const & json )
-
-
-
-
-static GptJsonConfig parse ( std :: istream & json )
-
-
-
-
-static GptJsonConfig parse ( std :: filesystem :: path const & path )
-
-
-
-
-
Private Members
-
-
-std :: string const mName
-
-
-
-
-std :: string const mVersion
-
-
-
-
-std :: string const mPrecision
-
-
-
-
-SizeType32 const mTensorParallelism
-
-
-
-
-SizeType32 const mPipelineParallelism
-
-
-
-
-SizeType32 const mContextParallelism
-
-
-
-
-SizeType32 const mGpusPerNode
-
-
-
-
-ModelConfig mModelConfig
-
-
-
-
-std :: optional < RuntimeDefaults > mRuntimeDefaults
-
-
-
-
-
-
-
-
-
-
-
-tllmLogger.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class TllmLogger : public nvinfer1 :: ILogger
-
-
Public Functions
-
-
-void log (
-
-
-Severity severity ,
-nvinfer1 :: AsciiChar const * msg ,
-
-
-) noexcept override
-
-
-
-
-Severity getLevel ( )
-
-
-
-
-void setLevel ( Severity level )
-
-
-
-
-
-
-
-
-
-
-
-worldConfig.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class WorldConfig
-
-
Public Functions
-
-
-explicit WorldConfig (
-
-
-SizeType32 tensorParallelism = 1 ,
-SizeType32 pipelineParallelism = 1 ,
-SizeType32 contextParallelism = 1 ,
-SizeType32 rank = 0 ,
-SizeType32 gpusPerNode = kDefaultGpusPerNode ,
-std :: optional < std :: vector < SizeType32 > > const & deviceIds = std :: nullopt ,
-bool enableAttentionDP = false ,
-
-
-)
-
-
-
-
-inline SizeType32 constexpr getSize ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getTensorParallelism ( ) const noexcept
-
-
-
-
-inline bool constexpr isTensorParallel ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getPipelineParallelism ( ) const noexcept
-
-
-
-
-inline bool constexpr isPipelineParallel ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getContextParallelism ( ) const noexcept
-
-
-
-
-inline bool constexpr isContextParallel ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getGpusPerNode ( ) const noexcept
-
-
-
-
-inline SizeType32 getGpusPerGroup ( ) const noexcept
-
-
-
-
-inline SizeType32 getDevice ( ) const noexcept
-
-
-
-
-inline SizeType32 getDeviceOf ( SizeType32 rank ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getPipelineParallelRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getTensorParallelRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getContextParallelRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getLocalRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getNodeRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getNodeRankOf (
-
-
-SizeType32 rank ,
-
-
-) const noexcept
-
-
-
-
-inline bool constexpr isFirstPipelineParallelRank ( ) const noexcept
-
-
-
-
-inline bool constexpr isLastPipelineParallelRank ( ) const noexcept
-Is my rank the last rank in its pipeline?
-
-
-
-
-inline bool constexpr isFirstTensorParallelRank ( ) const noexcept
-
-
-
-
-inline bool constexpr isFirstContextParallelRank ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr getLastRank ( ) const noexcept
-
-
-
-
-inline bool constexpr enableAttentionDP ( ) const noexcept
-
-
-
-
-std :: vector < SizeType32 > getPipelineParallelGroup ( ) const
-
-
-
-
-std :: vector < SizeType32 > getTensorParallelGroup ( ) const
-
-
-
-
-std :: vector < SizeType32 > getContextParallelGroup ( ) const
-
-
-
-
-bool validMpiConfig ( ) const
-
-
-
-
-
Public Static Functions
-
-
-static WorldConfig mpi (
-
-
-SizeType32 gpusPerNode = kDefaultGpusPerNode ,
-std :: optional < SizeType32 > tensorParallelism = std :: nullopt ,
-std :: optional < SizeType32 > pipelineParallelism = std :: nullopt ,
-std :: optional < SizeType32 > contextParallelism = std :: nullopt ,
-std :: optional < std :: vector < SizeType32 > > const & deviceIds = std :: nullopt ,
-bool enableAttentionDP = false ,
-
-
-)
-
-
-
-
-
Public Static Attributes
-
-
-static SizeType32 constexpr kDefaultGpusPerNode = 1
-
-
-
-
-
-
-
-
-
-
-
-
-common.h
-
-
Defines
-
-
-FMT_DIM
-
-
-
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
Typedefs
-
-
-using SizeType32 = std :: int32_t
-
-
-
-
-using SizeType64 = std :: int64_t
-
-
-
-
-using TokenIdType = std :: int32_t
-
-
-
-
-using LoraTaskIdType = std :: uint64_t
-
-
-
-
-
-
-
-
-
-
-
-
-using VecUniqueTokens = std :: vector < UniqueToken >
-
-
-
-
-template < typename T > using StringPtrMap = std :: unordered_map < std :: string , std :: shared_ptr < T > >
-
-
-
-
-
Enums
-
-
-enum class RequestType : std :: int32_t
-Values:
-
-
-enumerator kCONTEXT
-
-
-
-
-enumerator kGENERATION
-
-
-
-
-
-
-
-struct UniqueToken
-
-
Public Functions
-
-
-inline bool operator == ( UniqueToken const & other ) const noexcept
+)
-
-
-
-
-
-
-
-
-ipcUtils.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
Functions
-
-
-void lamportInitializeAll (
-
-
-void * buffer_0 ,
-void * buffer_1 ,
-void * buffer_2 ,
-size_t size ,
-
-
-)
-
-
-
-
-bool canAccessPeer ( WorldConfig const & worldConfig )
-
-
-
-
-
-class AllReduceBuffers
-
-
-
-
-class IpcMemory
+
+class LookaheadRuntimeBuffers
Public Functions
-
-IpcMemory (
+
+LookaheadRuntimeBuffers (
-std :: size_t bufferSize ,
-BufferManager const & manager ,
-WorldConfig const & worldConfig ,
-bool openIpc = true ,
-
-
-)
-
-
-
-
-~IpcMemory ( )
-
-
-
-
-IpcMemory ( IpcMemory const & ) = delete
-
-
-
-
-IpcMemory & operator = ( IpcMemory const & ) = delete
-
-
-
-
-IpcMemory ( IpcMemory & & ) = default
-
-
-
-
-IpcMemory & operator = ( IpcMemory & & ) = default
-
-
-
-
-inline std :: vector < void * > const & getCommPtrs ( ) const
-
-
-
-
-
Public Static Attributes
-
-
-static size_t constexpr FLAGS_SIZE = ( tensorrt_llm :: kernels :: MAX_ALL_REDUCE_BLOCKS + 1 ) * sizeof ( uint32_t )
-
-
-
-
-
Private Functions
-
-
-void allocateIpcMemory (
-
-
-std :: size_t bufferSize ,
-BufferManager const & manager ,
-WorldConfig const & worldConfig ,
-
-
-)
-
-
-
-
-void destroyIpcMemory ( )
-
-
-
-
-
Private Members
-
-
-SizeType32 mTpRank
-
-
-
-
-std :: vector < void * > mCommPtrs
-
-
-
-
-BufferPtr mBuffer
-
-
-
-
-bool mOpenIpc
-
-
-
-
-
-
-
-
-
-
-
-iGptDecoderBatched.h
-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
-
-
-
-namespace runtime
-
-
-class IGptDecoderBatched
-
-#include <iGptDecoderBatched.h>
-GPT decoder class with support for in-flight batching.
-Subclassed by tensorrt_llm::runtime::GptDecoderBatched
-
-
-
Public Functions
-
-
-virtual void setup (
-
-
-executor :: DecodingMode const & mode ,
SizeType32 maxBatchSize ,
SizeType32 maxBeamWidth ,
-nvinfer1 :: DataType dtype ,
+BufferManager const & manager ,
ModelConfig const & modelConfig ,
WorldConfig const & worldConfig ,
-
-
-) = 0
-Setup the decoder before calling forward()
-
-
-
-
-RequestVector const & genRequests ,
-TensorPtr const & batchSlots ,
-
-
-) = 0
-Disable Lookahead decoding.
-
-
-
-
-virtual CudaEvent forwardAsync (
-
-
-decoder :: DecoderState const & decoderState ,
-decoder_batch :: Input const & input ,
-
-
-) = 0
-Run one step for all requests without blocking the host process and return the token for synchronization.
-
-
-
-
-virtual void forward (
-
-
-decoder :: DecoderState const & decoderState ,
-decoder_batch :: Input const & input ,
-
-
-) = 0
-Run one step for all requests and wait for completion on the host.
-
-
-
-
-virtual CudaEvent finalize (
-
-
-decoder :: DecoderState const & decoderState ,
-SizeType32 batchSlot ,
-SamplingConfig const & samplingConfig ,
-bool streaming ,
-
-
-) const = 0
-Gather final beam search results for request batchIdx . Result will only be available after event returned.
-
-
-
-
-
Protected Functions
-
-
-IGptDecoderBatched ( ) = default
-
-
-
-
-virtual ~IGptDecoderBatched ( ) = default
-
-
-
-
-
-
-
-namespace decoder
-
-
-
-
-namespace decoder_batch
-
-
-class Input
-
-
-
Public Functions
-
-
-inline explicit Input (
-
-
-std :: vector < std :: vector < TensorConstPtr > > const & logits ,
-SizeType32 maxDecoderSteps ,
-
-
-)
-
-
-
-
-inline explicit Input ( std :: vector < TensorConstPtr > const & logits )
-
-
-
-
-
Public Members
-
-
-std :: vector < std :: vector < TensorConstPtr > > logits
-[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-Mandatory parameters Logits
-
-
-
-
-SizeType32 maxDecoderSteps
-Maximum number of decoding tokens of active slots.
-
-
-
-
-std :: vector < TensorPtr > batchSlots
-Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize].
-
-
-
-
-
-
-
-
-
-
-
-
-
-eagleBuffers.h
-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
-
-
-
-namespace runtime
-
-
-class EagleBuffers
-
-
-
Public Functions
-
-
-EagleBuffers (
-
-
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-runtime :: BufferManager const & manager ,
-runtime :: ModelConfig const & modelConfig ,
-runtime :: WorldConfig const & worldConfig ,
executor :: DecodingConfig const & decodingConfig ,
+TllmRuntime const & runtime ,
-)
+)
-
-void reshape (
+
+void setFromInputs (
-SizeType32 numCtxSequences ,
-SizeType32 numGenSequences ,
-runtime :: ModelConfig const & modelConfig ,
-
-
-)
-
-
-
-
-void setFromInputs (
-
-
-RequestVector const & contextRequests ,
-RequestVector const & genRequests ,
-runtime :: ITensor const & requestTypes ,
-ITensor const & seqSlots ,
-EagleBuffers :: Inputs const & decoderBuffers ,
-runtime :: BufferManager const & manager ,
-runtime :: ModelConfig const & modelConfig ,
-runtime :: WorldConfig const & worldConfig ,
-
-
-) const
-
-
-
-
-void insertInputTensors (
-
-
-TensorMap & inputBuffers ,
-TensorMap & outputBuffers ,
-runtime :: WorldConfig const & worldConfig ,
-
-
-) const
-
-
-
-
-
-
-
Private Members
-
-
-std :: size_t scanReduceTempStorageBytes = { 0 }
-
-
-
-
-float mDefaultPosteriorThreshold = { 0.09f }
-
-
-
-
-bool mDoGreedySampling = { true }
-
-
-
-
-BufferPtr scanReduceTempStorage
-
-
-
-
-TensorPtr cumSumGenerationLengths
-
-
-
-
-TensorPtr maxGenerationLength
-
-
-
-
-TensorPtr chunkedContextNextTokensHost
-
-
-
-
-TensorPtr greedySamplingHost
-
-
-
-
-TensorPtr posteriorAlphaHost
-
-
-
-
-TensorPtr posteriorThresholdHost
-
-
-
-
-
-class EngineOutputs
-
-
Public Members
-
-
-TensorPtr nextDraftTokens
-[batchSize, maxDecodingDraftTokens]
-
-
-
-
-TensorPtr nextDraftLens
-[batchSize]
-
-
-
-
-TensorPtr nextDraftPaths
-[batchSize, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr acceptedTokens
-[batchSize, maxPathLen]
-
-
-
-
-TensorPtr acceptedLens
-[batchSize]
-
-
-
-
-TensorPtr acceptedPaths
-[batchSize]
-
-
-
-
-TensorPtr chunkedContextNextTokens
-[batchSize]
-
-
-
-
-
-
-
-class Inputs
-
-
Public Functions
-
-
-void create (
-
-
-SizeType32 maxNumSequences ,
-BufferManager const & manager ,
+SizeType32 numCtxSequences ,
+SizeType32 numGenSequences ,
+ITensor const & requestTypes ,
+ITensor const & seqSlots ,
+LookaheadDecodingBuffers const & decoderLookaheadBuffers ,
+TllmRuntime const & runtime ,
ModelConfig const & modelConfig ,
WorldConfig const & worldConfig ,
-)
-
-
-
-
-
Public Members
-
-
-TensorPtr temperatures
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr posteriorAlpha
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr posteriorThreshold
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr randomDataSample
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr randomDataValidation
-[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
-
-
-
-
-TensorPtr draftTokens
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-
-
-
-
-TensorPtr draftLens
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr draftPaths
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr draftPathsHost
-[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr specDecodingGenerationLengths
-[maxBatchSize] or [numGenSequences]
-
-
-
-
-TensorPtr specDecodingGenerationLengthsHost
-[maxBatchSize] or [numGenSequences]
-
-
-
-
-TensorPtr specDecodingPackedMasks
-[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-
-
-
-
-TensorPtr specDecodingPositionOffsets
-[maxBatchSize] or [numGenSequences]
-
-
-
-
-TensorPtr eagleNetCtxRequestTypesHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr eagleNetCtxContextLengthsHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr eagleNetCtxPastKeyValueLengthsHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr eagleNetGenRequestTypesHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr eagleNetGenContextLengthsHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr eagleNetGenPastKeyValueLengthsHost
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr inputGenTokensHost
-[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
-
-
-
-
-TensorPtr chunkedContextNextTokens
-[maxBatchSize] or [numSequences]
-
-
-
-
-TensorPtr useSpecDecoding
-[1]
-
-
-
-
-TensorPtr useDynamicTreeHost
-[1]
-
-
-
-
-TensorPtr dynamicTreeMaxTopKHost
-[1]
-
-
-
-
-TensorPtr prevScores
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-
-
-
-
-TensorPtr currentExpandIndices
-[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
-
-
-
-
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-
-
-
-
-TensorPtr allLayersDraftTokenIds
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-
-
-
-
-TensorPtr allLayersDraftTokenIdsPredecessor
-[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
-
-
-
-
-
-
-
-
-
-
-
-
-
-samplingConfig.h
-
-
Defines
-
-
-SET_FROM_OPTIONAL ( varName , VarName , VarType )
-
-
-
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class SamplingConfig
-
-
Public Functions
-
-
-inline explicit SamplingConfig ( SizeType32 beamWidth = 1 )
+) const
-
-inline explicit SamplingConfig (
+
+void reshape (
-std :: vector < SamplingConfig > const & configs ,
+SizeType32 numCtxSequences ,
+SizeType32 numGenSequences ,
+SizeType32 tokensPerStep ,
-)
+)
-
-inline explicit SamplingConfig (
+
+void insertInputTensors (
-executor :: SamplingConfig const & samplingConfig ,
-std :: optional < executor :: ExternalDraftTokensConfig > const & externalDraftTokensConfig = std :: nullopt ,
+TensorMap & inputBuffers ,
+TensorMap & outputBuffers ,
+WorldConfig const & worldConfig ,
-)
+) const
-
-inline bool validate ( )
-
-
-
-
-template < typename T > inline bool useDefaultValues (
-
-
-OptVec < T > const & vec ,
-T defaultValue ,
-
-
-)
-
-
-
-
-inline bool operator == ( SamplingConfig const & other ) const
-
-
-
-
-inline SizeType32 getNumReturnBeams ( ) const
-
-
-
-
-inline SizeType32 getMaxBeamWidth ( ) const noexcept
-
-
-
-
-
-
Private Types
-
-
-using FloatType = float
-
-
-
-
-template < typename T > using OptVec = std :: optional < std :: vector < T > >
-
-
-
-
-
Private Functions
-
-
-template < typename T > inline bool validateVec (
-
-
-std :: string name ,
-OptVec < T > const & vec ,
-T min ,
-std :: optional < T > max = std :: nullopt ,
-
-
-)
-
-
-
-
-
Private Static Functions
-
-
-template < typename T > static inline OptVec < T > fuseValues (
-
-
-std :: vector < SamplingConfig > const & configs ,
-std :: function < OptVec < T > ( size_t ci ) > accessor ,
-T defaultValue ,
-
-
-)
-
-
-
-
-
-
-
-
-
-
-
-speculativeDecodingMode.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class SpeculativeDecodingMode
-
-
Public Types
-
-
-using UnderlyingType = std :: uint8_t
-
-
-
-
-
Public Functions
-
-
-inline bool constexpr isNone ( ) const
-
-
-
-
-inline bool constexpr isDraftTokensExternal ( ) const
-
-
-
-
-inline bool constexpr isMedusa ( ) const
-
-
-
-
-inline bool constexpr isLookaheadDecoding ( ) const
-
-
-
-
-inline bool constexpr isExplicitDraftTokens ( ) const
-
-
-
-
-inline bool constexpr isEagle ( ) const
-
-
-
-
-inline bool constexpr updatesPositionIds ( ) const
-
-
-
-
-inline bool constexpr requiresAttentionMask ( ) const
-
-
-
-
-inline bool constexpr predictsDraftTokens ( ) const
-
-
-
-
-inline bool constexpr needsKVCacheRewind ( ) const
-
-
-
-
-inline bool constexpr variableDraftLength ( ) const
-
-
-
-
-inline bool constexpr hasDraftLogits ( ) const
-
-
-
-
-inline bool constexpr needsDecoderPrologue ( ) const
-
-
-
-
-inline bool operator == ( SpeculativeDecodingMode const & other ) const
-
-
-
-
-inline explicit constexpr SpeculativeDecodingMode (
-
-
-UnderlyingType state ,
-
-
-)
-
-
-
-
-
Public Static Functions
-
-
-static inline auto constexpr None ( )
-
-
-
-
-static inline auto constexpr DraftTokensExternal ( )
-
-
-
-
-static inline auto constexpr Medusa ( )
-
-
-
-
-static inline auto constexpr LookaheadDecoding ( )
-
-
-
-
-static inline auto constexpr ExplicitDraftTokens ( )
-
-
-
-
-static inline auto constexpr Eagle ( )
-
-
-
-
-
Private Functions
-
-
-inline bool constexpr anyBitSet ( UnderlyingType bits ) const
-
-
-
-
-inline bool constexpr allBitSet ( UnderlyingType bits ) const
-
-
-
-
-
-
Private Static Attributes
-
-
-static UnderlyingType constexpr kNone = { 1U << 0U }
-
-
-
-
-static UnderlyingType constexpr kDraftTokensExternal = { 1U << 1U }
-
-
-
-
-static UnderlyingType constexpr kMedusa = { 1U << 2U }
-
-
-
-
-static UnderlyingType constexpr kLookaheadDecoding = { 1U << 3U }
-
-
-
-
-static UnderlyingType constexpr kExplicitDraftTokens = { 1U << 4U }
-
-
-
-
-static UnderlyingType constexpr kEagle = { 1U << 5U }
-
-
-
-
-
-
-
-
-
-
-
-memoryCounters.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class MemoryCounters
-
-
Public Types
-
-
-using SizeType32 = std :: size_t
-
-
-
-
-using DiffType = std :: ptrdiff_t
-
-
-
-
-
-
Public Static Functions
-
-
-static MemoryCounters & getInstance ( )
-
-
-
-
-static std :: string bytesToString ( SizeType32 bytes , int precision = 2 )
-
-
-
-
-static std :: string bytesToString ( DiffType bytes , int precision = 2 )
-
-
-
-
-
Private Members
-
-
-std :: atomic < SizeType32 > mGpu = { }
-
-
-
-
-std :: atomic < SizeType32 > mCpu = { }
-
-
-
-
-std :: atomic < SizeType32 > mPinned = { }
-
-
-
-
-std :: atomic < SizeType32 > mUVM = { }
-
-
-
-
-std :: atomic < SizeType32 > mPinnedPool = { }
-
-
-
-
-std :: atomic < DiffType > mGpuDiff = { }
-
-
-
-
-std :: atomic < DiffType > mCpuDiff = { }
-
-
-
-
-std :: atomic < DiffType > mPinnedDiff = { }
-
-
-
-
-std :: atomic < DiffType > mUVMDiff = { }
-
-
-
-
-std :: atomic < DiffType > mPinnedPoolDiff = { }
-
-
-
-
-
-
-
-
-
-
-
-runtimeDefaults.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-struct RuntimeDefaults
-
-
Public Functions
-
-
-inline RuntimeDefaults (
-
-
-std :: optional < std :: vector < SizeType32 > > maxAttentionWindowVec ,
-std :: optional < SizeType32 > sinkTokenLength ,
-
-
-)
-
-
-
-
-RuntimeDefaults ( ) = default
-
-
-
-
-
Public Members
-
-
-std :: optional < std :: vector < SizeType32 > > maxAttentionWindowVec
-
-
-
-
-std :: optional < SizeType32 > sinkTokenLength
-
-
-
-
-
-
-
-
-
-
-
-decodingOutput.h
-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
-
-
-
-namespace runtime
-
-
-class DecodingOutput
-
-
-
Public Functions
-
-
-DecodingOutput ( ) = default
-
-
-
-
-
Public Members
-
-
-TensorPtr ids
-Mandatory parameters Previously generated token ids for all steps before DecodingInput.step , [BS, BM, MSL]
-
-
-
-
-TensorPtr gatheredIds
-The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids .
-
-
-
-
-TensorPtr newTokensSteps
-New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
-
-
-
-
-TensorPtr newTokens
-A view of newTokensSteps for the current token, [BS, BM].
-
-
-
-
-std :: vector < TensorPtr > newTokensVec
-A Vector of views on newTokensSteps for each token [BS, BM].
-
-
-
-
-TensorPtr finishReasons
-Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
-
-
-
-
-TensorPtr finishedSum
-The sum of finished sequences per request, in pinned memory, [BS].
-
-
-
-
-TensorPtr logProbs
-Mandatory parameters for Beam Search log-probility of generated tokens, [BS, BM, MSL], float
-
-
-
-
-TensorPtr cumLogProbs
-Sum log-probility of all generated tokens, [BS, BM].
-
-
-
-
-TensorPtr parentIds
-Index of the beam where the previous token is, [BS, BM, MSL].
-
-
-
-
-TensorPtr lengths
-Total sequence lengths including padding, [BS, BM].
-
-
-
-
-TensorPtr cacheIndirection
-K/V indirection for next generation step, [BS, BM, MSL].
-
-
-
-
-TensorPtr logProbsTiled
-Buffer used to store the transpose of the logProbs, [MSL, BS, BM].
-
-
-
-
-BeamHypotheses beamHypotheses
-
-
-
-
-std :: optional < SpeculativeDecodingOutputs > speculativeDecodingOutputs
-
-
-
-
-std :: optional < ExplicitDraftTokensBuffers :: Inputs > explicitDraftTokensBuffers
-
-
-
-
-std :: optional < LookaheadDecodingBuffers > lookaheadOutputs
-
-
-
-
-std :: optional < EagleBuffers :: Inputs > eagleBuffers
-
-
-
-
-
Public Static Attributes
-
-
-static float constexpr kNegativeInfinity = - 1e20f
-
-
-
-
-
-class BeamHypotheses
-
-
-
-
-
-
-class SpeculativeDecodingOutputs
-
-
-
-
-
-
-
-
-
-
-
-decoderState.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-namespace decoder
-
-
-class BeamSearchBuffers
-
-
Public Functions
-
-
-explicit BeamSearchBuffers ( BufferManager const & bufferManager )
-
-
-
-
-void reshape ( SizeType32 maxBeamWidth , SizeType32 maxSequenceLength )
-
-
-
-
-
-
-
-
-class DecoderState
-
-
-
Public Functions
-
-
-DecoderState ( )
-
-
-
-
-void setup (
+
+void enableLookaheadDecoding (
SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-SizeType32 maxAttentionWindow ,
-SizeType32 sinkTokenLength ,
-SizeType32 maxSequenceLength ,
-nvinfer1 :: DataType dtype ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-BufferManager const & bufferManager ,
+SizeType32 tokensPerStep ,
-)
-Setup buffers for the decoder excluding speculative decoding.
-
-
-
-
-void setupCacheIndirection (
-
-
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-SizeType32 maxAttentionWindow ,
-BufferManager const & bufferManager ,
-
-
-)
-Setup buffers for the cache indirection.
-This is used for beam search on pipeline parallel ranks without a decoder.
-
-
-
-
-void setupSpeculativeDecoding (
-
-
-SpeculativeDecodingMode const & speculativeDecodingMode ,
-SizeType32 maxTokensPerEngineStep ,
-nvinfer1 :: DataType dtype ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-BufferManager const & bufferManager ,
-
-
-)
-Setup buffers for speculative decoding.
-
-
-
-
-Disable lookahead decoding.
-
-
-
-
-TensorPtr getFinishedSum ( ) const
-
-Returns:
-[batchSize], number of finished sequences per request, on gpu
-
-
-
-
-
-
-TensorPtr getFinishReasons ( ) const
-
-Returns:
-[batchSize, beamWidth], FinishedState value, on gpu
-
-
-
-
-
-
-TensorPtr getIds ( ) const
-
-Returns:
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
-
-
-
-
-
-
-TensorPtr getIds ( SizeType32 batchIdx ) const
-
-Parameters:
-batchIdx – index of the batch
-
-Returns:
-[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx , on gpu. In case of beam search, contains the ungathered data.
-
-
-
-
-
-
-TensorPtr getGatheredIds ( ) const
-
-Returns:
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
-
-
-
-
-
-
-TensorPtr getGatheredIds ( SizeType32 batchIdx ) const
-
-Parameters:
-batchIdx – index of the batch
-
-Returns:
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx , on gpu.
-
-
-
-
-
-
-TensorPtr getParentIds ( ) const
-
-Returns:
-[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
-
-
-
-
-
-TensorPtr getCumLogProbs ( ) const
-
-Returns:
-[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
-
-
-
-
-
-TensorPtr getCumLogProbs ( SizeType32 batchIdx ) const
-
-Returns:
-[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
-
-
-
-
-
-TensorPtr getLogProbs ( ) const
-
-Returns:
-[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
-
-
-
-
-
-TensorPtr getLogProbs ( SizeType32 batchIdx ) const
-
-Returns:
-[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
-
-
-
-
-
-TensorPtr getSequenceLengths ( ) const
-
-Returns:
-[batchSize, maxBeamWidth], sequence lengths, on gpu
-
-
-
-
-
-
-TensorPtr getSequenceLengths ( SizeType32 batchIdx ) const
-
-Parameters:
-batchIdx – index of the batch
-
-Returns:
-[maxBeamWidth], sequence lengths for request batchIdx , on gpu
-
-
-
-
-
-
-TensorPtr getAllNewTokens ( ) const
-Get maxTokensPerStep tokens generated in the last forward pass.
-
-Returns:
-[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
-
-
-
-
-
-TensorPtr getNextDraftTokens ( ) const
-
-Returns:
-[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
-
-
-
-
-
-
-TensorPtr getPrevDraftTokensLengths ( ) const
-
-Returns:
-[batchSize], predicted draft tokens lengths for previous step, on gpu
-
-
-
-
-
-
-TensorPtr getNextDraftTokensLengths ( ) const
-
-Returns:
-[batchSize], predicted draft tokens lengths for next step, on gpu
-
-
-
-
-
-
-TensorPtr getAcceptedLengthsCumSum ( ) const
-
-Returns:
-[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
-
-
-
-
-
-
-TensorPtr getAcceptedPackedPaths ( ) const
-
-Returns:
-[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
-
-
-
-
-
-
-TensorPtr getFinishedSteps ( ) const
-
-Returns:
-[maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
-
-
-
-
-
-
-SizeType32 getMaxBatchSize ( ) const
+)
-
-SizeType32 getMaxBeamWidth ( ) const
-
-
-
-
-SizeType32 getMaxSequenceLength ( ) const
-
-
-
-
-SizeType32 getMaxDecodingDecoderTokens ( ) const
-
-
-
-
-SizeType32 getMaxDecodingEngineTokens ( ) const
-
-
-
-
-std :: vector < SizeType32 > const & getNumDecodingEngineTokens ( ) const
-Get the number of tokens for all requests in the batch.
-
-Returns:
-The number of tokens for all requests in the batch.
-
-
-
-
-
-
-SizeType32 getNumDecodingEngineTokens ( SizeType32 batchIdx ) const
-Get the number of tokens for a specific request in the batch.
-
-Parameters:
-batchIdx – The index of the request in the batch.
-
-Returns:
-The number of tokens for the specified request.
-
-
-
-
-
-
-void setNumDecodingEngineTokens (
-
-
-SizeType32 batchIdx ,
-SizeType32 numTokens ,
-
-
-)
-Set the number of tokens for a specific request in the batch.
-
-Parameters:
-
-
-
-
-
-
-
-SpeculativeDecodingMode getSpeculativeDecodingMode ( ) const
-Get the speculative decoding mode.
-
-
-
-
-ExplicitDraftTokensBuffers :: Inputs const & getExplicitDraftTokensBuffers (
-
-
-
-
-) const
-Get the explicit draft tokens buffers.
-
-
-
-
-EagleBuffers :: Inputs const & getEagleBuffers ( ) const
-Get the eagle buffers.
-
-
-
-
-LookaheadDecodingBuffers const & getLookaheadBuffers ( ) const
-Get the lookahead buffers.
-
-
-
-
-BeamSearchBuffers const & getBeamSearchBuffers ( ) const
-Workspace for beam search in streaming mode.
-
-
-
-
-TensorPtr getCacheIndirectionInput ( ) const
-Cache indirection input for beam search.
-
-
-
-
-TensorPtr getCacheIndirectionOutput ( ) const
-Cache indirection output for beam search.
-
-
-
-
-std :: optional < std :: vector < SizeType32 > > const & getGenerationSteps (
-
-
-
-
-) const
-Get the generation steps for all requests in the batch.
-
-Returns:
-The generation steps for all requests in the batch.
-
-
-
-
-
-
-void setGenerationSteps (
-
-
-std :: vector < SizeType32 > const & generationSteps ,
-
-
-)
-Set the generation steps for all requests in the batch.
-
-Parameters:
-generationSteps – The generation steps for all requests in the batch.
-
-
-
-
-
-
-DecodingInput & getJointDecodingInput ( ) const
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-
-
-
-
-DecodingOutput & getJointDecodingOutput ( ) const
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-
-
-
-
-
Private Functions
-
-
-void setupBuffers (
-
-
-nvinfer1 :: DataType dtype ,
-BufferManager const & bufferManager ,
-
-
-)
-
-
-
-
-void reshapeBuffers (
-
-
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-SizeType32 maxAttentionWindow ,
-SizeType32 sinkTokenLength ,
-SizeType32 maxSequenceLength ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-BufferManager const & bufferManager ,
-
-
-)
-
-
-
-
-void setupCacheIndirectionBuffers ( BufferManager const & bufferManager )
-
-
-
-
-void reshapeCacheIndirectionBuffers (
-
-
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-SizeType32 maxAttentionWindow ,
-
-
-)
-
-
-
-
-void setupSpeculativeDecodingBuffers (
-
-
-SpeculativeDecodingMode speculativeDecodingMode ,
-nvinfer1 :: DataType dtype ,
-BufferManager const & bufferManager ,
-
-
-)
-
-
-
-
-void reshapeSpeculativeDecodingBuffers (
-
-
-SpeculativeDecodingMode const & speculativeDecodingMode ,
-SizeType32 maxTokensPerEngineStep ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-BufferManager const & bufferManager ,
-
-
-)
-
-
-
-
-
Private Members
-
-
-SizeType32 mMaxBatchSize = { }
-
-
-
-
-SizeType32 mMaxBeamWidth = { }
-
-
-
-
-SizeType32 mMaxSequenceLength = { }
-
-
-
-
-DecodingInputPtr mJointDecodingInput
-Stateful inputs for the decoder. Allocated for maxBatchSize slots.
-
-
-
-
-DecodingOutputPtr mJointDecodingOutput
-Stateful outputs for the decoder. Allocated for maxBatchSize slots.
-
-
-
-
-TensorPtr mFinishedSteps
-[maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token of maxTokensPerStep, on gpu
-
-
-
-
-std :: unique_ptr < BeamSearchBuffers > mBeamSearchBuffers
-Workspace for beam search in streaming mode.
-
-
-
-
-SizeType32 mMaxDecodingDecoderTokens = { 1 }
-
-
-
-
-SizeType32 mMaxDecodingEngineTokens = { 1 }
-
-
-
-
-std :: vector < SizeType32 > mNumDecodingEngineTokens
-[batchSize], the num tokens of each request.
-
-
-
-
-SpeculativeDecodingMode mSpeculativeDecodingMode = { SpeculativeDecodingMode :: None ( ) }
-
-
-
-
-
-
-
-
-
-
-
-
-
-gptDecoder.h
-
-
-namespace tensorrt_llm
-
-
-namespace layers
-
-
-
-
-namespace runtime
-
-
Functions
-
-
-inline runtime :: ITensor :: SharedConstPtr getDefaultBatchSlots (
-
-
-runtime :: SizeType32 batchSize ,
-
-
-)
-Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
-
-
-
-
-
-template < typename T > class GptDecoder : public virtual tensorrt_llm :: runtime :: IGptDecoder
-
-
-
Public Functions
-
-
-GptDecoder (
-
-
-executor :: DecodingMode const & mode ,
-size_t maxBatchSize ,
-size_t maxBeamWidth ,
-size_t vocabSize ,
-size_t vocabSizePadded ,
-CudaStreamPtr const & stream ,
-std :: shared_ptr < SpeculativeDecodingModule const > speculativeDecodingModule = nullptr ,
-
-
-)
-
-
-
-
-virtual void setup (
-
-
-SamplingConfig const & samplingConfig ,
-size_t batchSize ,
-TensorConstPtr const & batchSlots ,
-std :: optional < DecodingOutput > const & output = std :: nullopt ,
-std :: optional < nvinfer1 :: DataType > explicitDraftTokensDType = std :: nullopt ,
-std :: optional < std :: vector < TensorConstPtr > > const & lookaheadPrompt = std :: nullopt ,
-std :: optional < std :: vector < executor :: LookaheadDecodingConfig > > const & lookaheadAlgoConfigs = std :: nullopt ,
-
-
-) override
-
-Parameters:
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-
-
-
-
-
-
-virtual void forwardAsync (
-
-
-DecodingOutput & output ,
-DecodingInput const & input ,
-
-
-) override
-
-
-
-
-virtual void forwardSync (
-
-
-DecodingOutput & output ,
-DecodingInput const & input ,
-
-
-) override
-
-
-
-
-inline virtual SamplingConfig const & getSamplingConfig ( ) override
-
-
-
-
-std :: optional < SamplingConfig > const & samplingConfig ,
-SizeType32 batchSize ,
-TensorConstPtr batchSlots ,
-
-
-) override
-
-
-
-
-
Private Members
-
-
-std :: shared_ptr < BufferManager > mManager
-
-
-
-
-std :: shared_ptr < tensorrt_llm :: layers :: DynamicDecodeLayer < T > > mDynamicDecodeLayer
-
-
-
-
-std :: shared_ptr < tensorrt_llm :: runtime :: DecodingLayerWorkspace > mDecodingLayerWorkspace
-
-
-
-
-SamplingConfig mSamplingConfig
-
-
-
-
-size_t mMaxBatchSize
-
-
-
-
-size_t mVocabSize
-
-
-
-
-size_t mVocabSizePadded
-
-
-
-
-executor :: DecodingMode mDecodingMode
-
-
-
-
-
-
-
-class IGptDecoder
-Subclassed by tensorrt_llm::runtime::GptDecoder< T >
-
-
-
Public Functions
-
-
-virtual ~IGptDecoder ( ) = default
-
-
-
-
-virtual void setup (
-
-
-SamplingConfig const & samplingConfig ,
-size_t batchSize ,
-TensorConstPtr const & batchSlots ,
-std :: optional < DecodingOutput > const & output = std :: nullopt ,
-std :: optional < nvinfer1 :: DataType > explicitDraftTokensDType = std :: nullopt ,
-std :: optional < std :: vector < TensorConstPtr > > const & lookaheadPrompt = std :: nullopt ,
-std :: optional < std :: vector < executor :: LookaheadDecodingConfig > > const & lookaheadAlgoConfigs = std :: nullopt ,
-
-
-) = 0
-
-Parameters:
-explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.
-
-
-
-
-
-
-virtual void forwardAsync (
-
-
-DecodingOutput & output ,
-DecodingInput const & input ,
-
-
-) = 0
-
-
-
-
-virtual void forwardSync (
-
-
-DecodingOutput & output ,
-DecodingInput const & input ,
-
-
-) = 0
-
-
-
-
-virtual SamplingConfig const & getSamplingConfig ( ) = 0
-
-
-
-
-std :: optional < SamplingConfig > const & samplingConfig ,
-SizeType32 batchSize ,
-TensorConstPtr batchSlots ,
-
-
-) = 0
-
-
-
-
-
-
-
-
-
-
-
-
-explicitDraftTokensBuffers.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class ExplicitDraftTokensBuffers
-
-
-
Public Functions
-
-
-ExplicitDraftTokensBuffers (
-
-
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-runtime :: BufferManager const & manager ,
-runtime :: ModelConfig const & modelConfig ,
-runtime :: WorldConfig const & worldConfig ,
-
-
-)
-
-
-
-
-void reshape (
-
-
-SizeType32 numCtxSequences ,
-SizeType32 numGenSequences ,
-runtime :: ModelConfig const & modelConfig ,
-
-
-)
-
-
-
-
-void setFromInputs (
-
-
-SizeType32 numCtxSequences ,
-SizeType32 numGenSequences ,
-runtime :: ITensor const & requestTypes ,
-ITensor const & seqSlots ,
-ExplicitDraftTokensBuffers :: Inputs const & decoderBuffers ,
-ITensor const & contextPositionIds ,
-runtime :: ModelConfig const & modelConfig ,
-runtime :: WorldConfig const & worldConfig ,
-runtime :: BufferManager const & manager ,
-runtime :: CudaStream const & stream ,
-
-
-) const
-
-
-
-
-void insertInputTensors (
-
-
-TensorMap & inputBuffers ,
-TensorMap & outputBuffers ,
-runtime :: WorldConfig const & worldConfig ,
-
-
-) const
+
+void disableLookaheadDecoding ( )
-
-
-
-class EngineInputs : public tensorrt_llm :: runtime :: ExplicitDraftTokensBuffers :: Inputs
-
-
Public Members
-
-
-TensorPtr requestTypesDevice
-[numSequences], on gpu
-
-
-
-
-TensorPtr positionOffsets
-[numGenSequences]
-
-
-
-
-
-
-
-class EngineOutputs
-
-
Public Members
-
-
-TensorPtr nextGenerationLengths
-[batchSize]
-
-
-
-
-TensorPtr nextPositionOffsets
-[batchSize]
-
-
-
-
-TensorPtr masks
-[batchSize, maxDecodingTokens, maxDecodingTokens], bool
-
-
-
-
-TensorPtr nextDraftTokens
-[batchSize, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr nextDraftIndices
-[batchSize, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr nextDraftProbs
-[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
-
-
-
-
-TensorPtr nextFlatTokens
-[batchSize * maxDecodingTokens]
-
-
-
-
-TensorPtr bestPathLengths
-[batchSize]
-
-
-
-
-TensorPtr bestPathIndices
-[batchSize]
-
-
-
-
-TensorPtr maxGenToken
-[1]
-
-
-
-
-TensorPtr totalGenToken
-[1]
-
-
-
-
-TensorPtr packedPositionIds
-[batchSize * maxDecodingTokens]
-
-
-
-
-
-
-
-class Inputs
-Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
-
-
-
Public Members
-
-
-TensorPtr temperatures
-[maxBatchSize]
-
-
-
-
-TensorPtr positionIdsBase
-[maxBatchSize]
-
-
-
-
-TensorPtr generationLengths
-[maxBatchSize] or [numGenSequences]
-
-
-
-
-TensorPtr randomDataSample
-[maxBatchSize]
-
-
-
-
-TensorPtr randomDataValidation
-[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
-
-
-
-
-TensorPtr draftTokens
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr draftIndices
-[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
-
-
-
-
-TensorPtr draftProbs
-[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
-
-
-
-
-TensorPtr packedMasks
-[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
-
-
-
-
-TensorPtr positionIds
-[maxBatchSize] or [numGenSequences]
-
-
-
-
-TensorPtr maxGenLengthHost
-
-
-
-
-TensorPtr generationLengthsHost
-
-
-
-
-TensorPtr useSpecDecoding
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-bufferManager.h
+
+iBuffer.h
namespace tensorrt_llm
namespace runtime
-
-
-class BufferManager
-
-#include <bufferManager.h>
-A helper class for managing memory on host and device.
-
-
Public Types
+
+
Typedefs
-
-using IBufferPtr = IBuffer :: UniquePtr
-
-
-
-
-using ITensorPtr = ITensor :: UniquePtr
-
-
-
-
-using CudaStreamPtr = std :: shared_ptr < CudaStream >
-
-
-
-
-using CudaMemPoolPtr = std :: shared_ptr < CudaMemPool >
+
+template < typename T > using PointerElementType = typename std :: remove_reference_t < T > :: element_type
-
Public Functions
+
Enums
+
+
+enum class MemoryType : std :: int32_t
+Values:
+
+
+enumerator kGPU
+
+
+
+
+enumerator kCPU
+
+
+
+
+enumerator kPINNED
+
+
+
+
+enumerator kUVM
+
+
+
+
+enumerator kPINNEDPOOL
+
+
+
+
+
+
+
Functions
-
-explicit BufferManager ( CudaStreamPtr stream , bool trimPool = false )
-Construct a BufferManager .
+
+template < typename T > std :: shared_ptr < std :: remove_const_t < T > > constPointerCast (
+
+
+std :: shared_ptr < T > const & ptr ,
+
+
+) noexcept
+
+
+
+
+template < typename T , typename D > std :: shared_ptr < std :: remove_const_t < T > > constPointerCast (
+
+
+std :: unique_ptr < T , D > & & ptr ,
+
+
+) noexcept
+
+
+
+
+template < typename T > T const * bufferCast ( IBuffer const & buffer )
+Gets a typed pointer to the constant underlying data of the buffer.
-Parameters:
-cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+buffer – The buffer to get a pointer to.
+
+Returns:
+A pointer to constant T .
-
-inline ~BufferManager ( )
-Destructor.
+
+template < typename T > T * bufferCast ( IBuffer & buffer )
+Gets a typed pointer to the underlying data of the buffer.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+buffer – The buffer to get a pointer to.
+
+Returns:
+A pointer to T .
+
+
-
-IBufferPtr gpu (
+
+template < typename T > T * bufferCastOrNull (
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+IBuffer :: SharedPtr const & bufferPtr ,
-) const
-Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
+)
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+bufferPtr – A possibly null shared ptr.
+
+Returns:
+A pointer to T, possibly nullptr.
+
+
-
-ITensorPtr gpu (
+
+template < typename T > T const * bufferCastOrNull (
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+IBuffer :: SharedConstPtr const & bufferPtr ,
-) const
-Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
+)
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+bufferPtr – A possibly null shared ptr.
+
+Returns:
+A pointer to const T, possibly nullptr.
+
+
-
-IBufferPtr allocate (
+
+template < typename T > T * bufferCastOrNull (
-MemoryType memoryType ,
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+std :: optional < IBuffer :: SharedPtr > const & optionalBufferPtr ,
-) const
-Allocates an IBuffer of the given size and memory type.
+)
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+optionalBufferPtr – A possibly empty optional.
+
+Returns:
+A pointer to T, possibly nullptr.
+
+
-
-ITensorPtr allocate (
+
+template < typename T > T const * bufferCastOrNull (
-MemoryType memoryType ,
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+std :: optional < IBuffer :: SharedConstPtr > const & optionalBufferPtr ,
-) const
-Allocates an ITensor of the given dimensions and memory type.
+)
+Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+optionalBufferPtr – A possibly empty optional.
+
+Returns:
+A pointer to const T, possibly nullptr.
+
+
-
-inline IBufferPtr emptyBuffer (
+
+std :: ostream & operator << (
-MemoryType memoryType ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+std :: ostream & output ,
+IBuffer const & buffer ,
-) const
-Create an empty IBuffer of the given memory type. It may be resized later.
-
-
-
-
-inline ITensorPtr emptyTensor (
-
-
-MemoryType memoryType ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-) const
-Create an empty ITensor of the given memory type. It may be reshaped later.
-
-
-
-
-void setMem ( IBuffer & buffer , int32_t value ) const
-Set the contents of the given buffer to value.
-
-
-
-
-void setZero ( IBuffer & buffer ) const
-Set the contents of the given buffer to zero.
-
-
-
-
-void copy ( void const * src , IBuffer & dst , MemoryType srcType ) const
-Copy src to dst .
-
-
-
-
-void copy ( IBuffer const & src , void * dst , MemoryType dstType ) const
-Copy src to dst .
-
-
-
-
-inline void copy ( void const * src , IBuffer & dst ) const
-Copy src to dst .
-
-
-
-
-inline void copy ( IBuffer const & src , void * dst ) const
-Copy src to dst .
-
-
-
-
-void copy ( IBuffer const & src , IBuffer & dst ) const
-Copy src to dst .
-
-
-
-
-IBufferPtr copyFrom ( IBuffer const & src , MemoryType memoryType ) const
-Copy src into a new IBuffer with a potentially different memory type.
-
-
-
-
-ITensorPtr copyFrom ( ITensor const & src , MemoryType memoryType ) const
-Copy src into a new ITensor with a potentially different memory type.
-
-
-
-
-template < typename T > inline IBufferPtr copyFrom (
-
-
-std :: vector < T > const & src ,
-MemoryType memoryType ,
-
-
-) const
-Copy src into a new IBuffer with a potentially different memory type.
-
-
-
-
-template < typename T > inline ITensorPtr copyFrom (
-
-
-T * src ,
-nvinfer1 :: Dims dims ,
-MemoryType memoryType ,
-
-
-) const
-Copy src into a new ITensor with a potentially different memory type.
-
-
-
-
-template < typename T > inline ITensorPtr copyFrom (
-
-
-std :: vector < T > const & src ,
-nvinfer1 :: Dims dims ,
-MemoryType memoryType ,
-
-
-) const
-Copy src into a new ITensor with a potentially different memory type.
-
-
-
-
-CudaStream const & getStream ( ) const
-Get the underlying cuda stream.
-
-
-
-
-std :: size_t memoryPoolReserved ( ) const
-The current size of the memory reserved by the memory pool.
-
-
-
-
-std :: size_t memoryPoolUsed ( ) const
-The current size of the memory used by the memory pool.
-
-
-
-
-std :: size_t memoryPoolFree ( ) const
-The current size of the memory free in the memory pool.
-
-
-
-
-void memoryPoolTrimTo ( std :: size_t size )
-Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
+)
+Utility function to print a buffer.
+
+
+class BufferDataType
+
+#include <iBuffer.h>
+A wrapper around nvinfer1::DataType that provides support for pointer types.
-
Public Static Functions
+
Public Functions
-
-static IBufferPtr gpuSync (
+
+inline constexpr BufferDataType (
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
+nvinfer1 :: DataType dataType ,
+bool _unsigned = false ,
+bool pointer = false ,
-)
-Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
-
+
)
+
-
-static ITensorPtr gpuSync (
-
-
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
-
+
+inline constexpr operator nvinfer1 :: DataType ( ) const noexcept
+
-
-static IBufferPtr cpu (
-
-
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates an IBuffer of the given size on the CPU.
-
+
+inline constexpr nvinfer1 :: DataType getDataType ( ) const noexcept
+
-
-static ITensorPtr cpu (
-
-
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates an ITensor of the given dimensions on the CPU.
-
+
+inline constexpr bool isPointer ( ) const noexcept
+
-
-static IBufferPtr pinned (
-
-
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates a pinned IBuffer of the given size on the CPU.
-
+
+inline constexpr bool isUnsigned ( ) const
+
-
-static ITensorPtr pinned (
-
-
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates a pinned ITensor of the given dimensions on the CPU.
-
+
+inline constexpr std :: size_t getSize ( ) const noexcept
+
-
-static IBufferPtr pinnedPool (
-
-
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
-
-
-
-static ITensorPtr pinnedPool (
-
-
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
-
-
-
-
-static IBufferPtr managed (
-
-
-std :: size_t size ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates an IBuffer of the given size in UVM.
-
-
-
-
-static ITensorPtr managed (
-
-
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type = kBYTE_TYPE ,
-
-
-)
-Allocates an ITensor of the given dimensions in UVM.
-
-
-
-
-static ITensorPtr ipcNvls (
-
-
-std :: set < int > ranks ,
-nvinfer1 :: Dims dims ,
-nvinfer1 :: DataType type ,
-
-
-)
-Allocates an ITensor of the given dimensions for NVLS.
-
+
+inline constexpr std :: size_t getSizeInBits ( ) const noexcept
+
Public Static Attributes
-
-static auto constexpr kBYTE_TYPE = nvinfer1 :: DataType :: kUINT8
+
+static auto constexpr kTrtPointerType = nvinfer1 :: DataType :: kINT64
-
-
Friends
-
-
-friend class ::BufferManagerTest
-
-
-
-
-
-
-
-
-
-
-
-rawEngine.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class RawEngine
-
-
Public Types
-
-
-enum Type
-Values:
-
-
-enumerator FilePath
-
-
-
-
-enumerator AddressWithSize
-
-
-
-
-enumerator HostMemory
-
-
-
-
-
-
-
Public Functions
-
-
-inline explicit RawEngine ( std :: filesystem :: path enginePath ) noexcept
-
-
-
-
-inline explicit RawEngine (
-
-
-void const * engineAddr ,
-std :: size_t engineSize ,
-
-
-) noexcept
-
-
-
-
-inline explicit RawEngine (
-
-
-nvinfer1 :: IHostMemory const * engineBuffer ,
-
-
-) noexcept
-
-
-
-
-inline Type getType ( ) const
-
-
-
-
-inline std :: filesystem :: path getPath ( ) const
-
-
-
-
-inline std :: optional < std :: filesystem :: path > getPathOpt ( ) const
-
-
-
-
-inline void setPath ( std :: filesystem :: path enginePath )
-
-
-
-
-inline std :: optional < std :: map < std :: string , tensorrt_llm :: executor :: Tensor > > const & getManagedWeightsMapOpt (
-
-
-
-
-) const
-
-
-
-
-inline void setManagedWeightsMap (
-
-
-std :: map < std :: string , tensorrt_llm :: executor :: Tensor > managedWeightsMap ,
-
-
-)
-
-
-
-
-inline void const * getAddress ( ) const
-
-
-
-
-inline std :: size_t getSize ( ) const
-
-
-
-
-inline nvinfer1 :: IHostMemory const * getHostMemory ( ) const
-
-
-
-
-
Public Members
-
-
-void const * mEngineAddr = { }
-
-
-
-
-std :: size_t mEngineSize = { }
-
-
-
-
-
Private Members
-
-
-Type mType
-
-
-
-
-std :: optional < std :: filesystem :: path > mEnginePath
-
-
-
-
-struct tensorrt_llm::runtime::RawEngine
-
-
-
-
-nvinfer1 :: IHostMemory const * mEngineBuffer = { }
-
-
-
-
-std :: optional < std :: map < std :: string , tensorrt_llm :: executor :: Tensor > > mManagedWeightsMap
+
+bool mPointer
-
-
-
-
-
-
-loraModule.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
Functions
-
-
-inline std :: ostream & operator << (
-
-
-std :: ostream & output ,
-LoraModule const & module ,
-
-
-)
-
-
-
-
-class LoraModule
+
+template < typename T > class BufferRange : public tensorrt_llm :: common :: ArrayView < T >
Public Types
-
-
-enum class ModuleType : SizeType32
-Values:
-
-
-enumerator kINVALID
-
-
-
-
-enumerator kATTN_QKV
-
-
-
-
-enumerator kATTN_Q
-
-
-
-
-enumerator kATTN_K
-
-
-
-
-enumerator kATTN_V
-
-
-
-
-enumerator kATTN_DENSE
-
-
-
-
-enumerator kMLP_H_TO_4H
-
-
-
-
-enumerator kMLP_4H_TO_H
-
-
-
-
-enumerator kMLP_GATE
-
-
-
-
-enumerator kCROSS_ATTN_QKV
-
-
-
-
-enumerator kCROSS_ATTN_Q
-
-
-
-
-enumerator kCROSS_ATTN_K
-
-
-
-
-enumerator kCROSS_ATTN_V
-
-
-
-
-enumerator kCROSS_ATTN_DENSE
-
-
-
-
-enumerator kMOE_H_TO_4H
-
-
-
-
-enumerator kMOE_4H_TO_H
-
-
-
-
-enumerator kMOE_GATE
-
-
-
-
-enumerator kMOE_ROUTER
-
-
-
-
-enumerator kMLP_ROUTER
-
-
-
-
-enumerator kMLP_GATE_UP
-
-
-
-
-
-using TensorPtr = ITensor :: SharedPtr
+
+using Base = tensorrt_llm :: common :: ArrayView < T >
Public Functions
-
-inline explicit constexpr LoraModule (
+
+inline BufferRange ( T * data , size_type size )
+
+
+
+
+template < typename U = T , std :: enable_if_t < ! std :: is_const_v < U > , bool > = true > inline explicit BufferRange (
-ModuleType const & t ,
-SizeType32 inDim ,
-SizeType32 outDim ,
-bool inDimFirst ,
-bool outDimFirst ,
-SizeType32 inTpSplitDim ,
-SizeType32 outTpSplitDim ,
+IBuffer & buffer ,
-) noexcept
+)
-
-inline explicit constexpr LoraModule ( ) noexcept
-
-
-
-
-explicit constexpr LoraModule ( LoraModule const & o ) = default
-
-
-
-
-constexpr LoraModule & operator = ( LoraModule const & o ) = default
-
-
-
-
-inline SizeType32 constexpr flattenedInOutSize (
+
+template < typename U = T , std :: enable_if_t < std :: is_const_v < U > , bool > = true > inline explicit BufferRange (
-SizeType32 adapterSize ,
-bool isDora ,
+IBuffer const & buffer ,
-) const noexcept
+)
+
+
+
+
+
+
+
+template < nvinfer1 :: DataType kDataType , bool kIsUnsigned = false , bool kIsPointer = false > struct DataTypeTraits
+
+#include <iBuffer.h>
+For converting a TensorRT data type to a C++ data type.
+
+
+
+
+template < nvinfer1 :: DataType kDataType , bool kUnsigned > struct DataTypeTraits < kDataType , kUnsigned , true >
+
+
Public Types
+
+
+using type = typename DataTypeTraits < kDataType , kUnsigned , false > :: type *
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "*"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < bool kUnsigned > struct DataTypeTraits < nvinfer1 :: DataType :: kBOOL , kUnsigned >
+
+
Public Types
+
+
+using type = bool
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "bool"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kFLOAT >
+
+
Public Types
+
+
+using type = float
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "float"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kHALF >
+
+
Public Types
+
+
+using type = half
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "half"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT32 >
+
+
Public Types
+
+
+using type = std :: int32_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "int32"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT32 , true >
+
+
Public Types
+
+
+using type = std :: uint32_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "uint32"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT64 >
+
+
Public Types
+
+
+using type = std :: int64_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "int64"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT64 , true >
+
+
Public Types
+
+
+using type = std :: uint64_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "uint64"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT8 >
+
+
Public Types
+
+
+using type = std :: int8_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "int8"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+template < bool kUnsigned > struct DataTypeTraits < nvinfer1 :: DataType :: kUINT8 , kUnsigned >
+
+
Public Types
+
+
+using type = std :: uint8_t
+
+
+
+
+
Public Static Attributes
+
+
+static char constexpr name [ ] = "uint8"
+
+
+
+
+static auto constexpr size = sizeof ( type )
+
+
+
+
+
+
+
+class IBuffer
+Subclassed by tensorrt_llm::runtime::ITensor
+
+
Public Types
+
+
+using UniquePtr = std :: unique_ptr < IBuffer >
+
+
+
+
+using SharedPtr = std :: shared_ptr < IBuffer >
+
+
+
+
+using UniqueConstPtr = std :: unique_ptr < IBuffer const >
+
+
+
+
+using SharedConstPtr = std :: shared_ptr < IBuffer const >
+
+
+
+
+using DataType = nvinfer1 :: DataType
+
+
+
+
+
Public Functions
+
+
+virtual void * data ( ) = 0
+Returns a pointer to underlying array.
+
+
+
+
+virtual void const * data ( ) const = 0
+Returns a pointer to underlying array.
+
+
+
+
+inline virtual void * data ( std :: size_t index )
+Returns a pointer to the underlying array at a given element index.
+
+
+
+
+inline virtual void const * data ( std :: size_t index ) const
+Returns a pointer to the underlying array at a given element index.
+
+
+
+
+virtual std :: size_t getSize ( ) const = 0
+Returns the size (in number of elements) of the buffer.
+
+
+
+
+inline virtual std :: size_t getSizeInBytes ( ) const
+Returns the size (in bytes) of the buffer.
+
+
+
+
+virtual std :: size_t getCapacity ( ) const = 0
+Returns the capacity of the buffer.
+
+
+
+
+virtual DataType getDataType ( ) const = 0
+Returns the data type of the buffer.
+
+
+
+
+virtual char const * getDataTypeName ( ) const
-
-inline SizeType32 constexpr inSize (
+
+virtual MemoryType getMemoryType ( ) const = 0
+ Returns the memory type of the buffer.
+
-
-SizeType32 adapterSize ,
-
-
-
) const noexcept
+
+
+virtual char const * getMemoryTypeName ( ) const
-
-inline SizeType32 constexpr outSize (
+
+virtual void resize ( std :: size_t newSize ) = 0
+ Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+
-
-SizeType32 adapterSize ,
-
+
+
+virtual void release ( ) = 0
+Releases the buffer. It will be reset to nullptr.
+
-
) const noexcept
+
+
+virtual ~IBuffer ( ) = default
-
-inline SizeType32 constexpr localInSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-
-
-) const noexcept
-
+
+IBuffer ( IBuffer const & ) = delete
+
Not allowed to copy.
+
-
-inline SizeType32 constexpr localOutSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localScalesSize (
-
-
-SizeType32 tpSize ,
-bool isDora ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localInDim (
-
-
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localOutDim (
-
-
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localInAdapterSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localOutAdapterSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localInOutSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr localTotalSize (
-
-
-SizeType32 adapterSize ,
-SizeType32 tpSize ,
-bool isDora ,
-
-
-) const noexcept
-
-
-
-
-inline SizeType32 constexpr value ( ) const noexcept
-
-
-
-
-inline std :: string_view constexpr name ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr inDim ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr outDim ( ) const noexcept
-
-
-
-
-inline bool constexpr inDimFirst ( ) const noexcept
-
-
-
-
-inline bool constexpr outDimFirst ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr inTpSplitDim ( ) const noexcept
-
-
-
-
-inline SizeType32 constexpr outTpSplitDim ( ) const noexcept
-
+
+IBuffer & operator = ( IBuffer const & ) = delete
+
Not allowed to copy.
+
Public Static Functions
-
-static std :: vector < LoraModule > createLoraModules (
-
-
-std :: vector < std :: string > const & loraModuleNames ,
-SizeType32 hiddenSize ,
-SizeType32 mlpHiddenSize ,
-SizeType32 numAttentionHeads ,
-SizeType32 numKvAttentionHeads ,
-SizeType32 attentionHeadSize ,
-SizeType32 tpSize ,
-SizeType32 numExperts ,
-
-
-)
+
+static char const * getDataTypeName ( DataType dataType )
-
-static inline ModuleType constexpr toModuleType (
+
+static UniquePtr slice (
-std :: string_view const & name ,
+SharedPtr buffer ,
+std :: size_t offset ,
+std :: size_t size ,
-)
-
-
-
-
-static inline std :: string_view constexpr toModuleName (
-
-
-ModuleType t ,
-
-
-) noexcept
-
-
-
-
-static inline std :: string_view constexpr toModuleName ( SizeType32 id )
-
-
-
-
-
-
-
-
-
-
-
-
-request.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-namespace decoder_batch
-
-
-class Request
-
-
-
-
-
-
-
-
-
-
-
-
-
-cudaStream.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class CudaStream
-
-
Public Functions
-
-
-inline explicit CudaStream (
-
-
-unsigned int flags = cudaStreamNonBlocking ,
-int priority = 0 ,
-
-
-)
-Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
+)
+Creates a sliced view on the underlying buffer . The view will have the same data type as buffer .
Parameters:
-flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
-priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
+buffer – The buffer to view.
+offset – The offset of the view.
+size – The size of the view.
+Returns:
+A view on the buffer .
+
-
-inline explicit CudaStream (
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-cudaStream_t stream ,
-int device ,
-bool ownsStream = true ,
+TConstPtr & & tensor ,
+std :: size_t offset ,
+std :: size_t size ,
-)
-Pass an existing cuda stream to this object.
+)
+
+
+
+
+static inline UniquePtr slice ( SharedPtr buffer , std :: size_t offset )
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+std :: size_t offset ,
+
+
+)
+
+
+
+
+static inline UniquePtr view ( SharedPtr tensor )
+Returns a view on the underlying tensor which can be independently resized.
+
+Parameters:
+tensor – The tensor to view.
+
+Returns:
+A view on the tensor .
+
+
+
+
+
+
+static inline UniquePtr view ( SharedPtr tensor , std :: size_t size )
+Returns a view on the underlying tensor with a different size.
Parameters:
-stream – The stream to pass to this object.
-device – The device on which the stream was created.
-ownsStream – Whether this object owns the stream and destroys it in the destructor.
+tensor – The tensor to view.
+size – The size of the view.
-
-
-
-
-
-inline explicit CudaStream ( cudaStream_t stream )
-Construct with an existing cuda stream or the default stream by passing nullptr.
-
-
-
-
-inline int getDevice ( ) const
-Returns the device on which the stream was created.
-
-
-
-
-inline cudaStream_t get ( ) const
-Returns the stream associated with this object.
-
-
-
-
-inline void synchronize ( ) const
-Synchronizes the stream.
-
-
-
-
-inline void record ( CudaEvent :: pointer event ) const
-Record an event on the stream.
-
-
-
-
-inline void record ( CudaEvent const & event ) const
-Record an event on the stream.
-
-
-
-
-inline void wait ( CudaEvent :: pointer event ) const
-Wait for an event.
-
-
-
-
-inline void wait ( CudaEvent const & event ) const
-Wait for an event.
-
-
-
-
-
Private Types
-
-
-using StreamPtr = std :: unique_ptr < std :: remove_pointer_t < cudaStream_t > , Deleter >
-
-
-
-
-
Private Members
-
-
-StreamPtr mStream
-
-
-
-
-int mDevice = { - 1 }
-
-
-
-
-
-class Deleter
-
-
Public Functions
-
-
-inline explicit Deleter ( bool ownsStream )
-
-
-
-
-inline explicit Deleter ( )
-
-
-
-
-inline constexpr void operator () ( cudaStream_t stream ) const
-
-
-
-
-
Private Members
-
-
-bool mOwnsStream
-
-
-
-
-
-
-
-
-
-
-
-
-
-cudaEvent.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class CudaEvent
-
-
Public Types
-
-
-using pointer = cudaEvent_t
-
-
-
-
-
Public Functions
-
-
-inline explicit CudaEvent ( unsigned int flags = cudaEventDisableTiming )
-Creates a new cuda event. The event will be destroyed in the destructor.
-
-Parameters:
-flags – Flags for event creation. By default, event timing is disabled.
+Returns:
+A view on the tensor .
-
-inline explicit CudaEvent ( pointer event , bool ownsEvent = true )
-Pass an existing cuda event to this object.
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr view (
+
+
+TConstPtr & & tensor ,
+std :: size_t size ,
+
+
+)
+
+
+
+
+static UniquePtr wrap (
+
+
+void * data ,
+DataType type ,
+std :: size_t size ,
+std :: size_t capacity ,
+
+
+)
+Wraps the given data in an IBuffer . The IBuffer will not own the underlying data and cannot be resized beyond capacity .
Parameters:
-event – The event to pass to this object.
-ownsEvent – Whether this object owns the event and destroys it in the destructor.
+data – The data to wrap.
+type – The data type of the data .
+size – The size of the buffer.
+capacity – The capacity of the buffer.
+Returns:
+An IBuffer .
+
-
-inline pointer get ( ) const
-Returns the event associated with this object.
-
+
+static inline UniquePtr wrap (
+
+
+void * data ,
+DataType type ,
+std :: size_t size ,
+
+
+
)
+
-
-inline void synchronize ( ) const
-Synchronizes the event.
+
+template < typename T > static inline UniquePtr wrap (
+
+
+T * data ,
+std :: size_t size ,
+std :: size_t capacity ,
+
+
+)
+
+
+
+
+template < typename T > static inline UniquePtr wrap (
+
+
+T * data ,
+std :: size_t size ,
+
+
+)
+
+
+
+
+template < typename T > static inline UniquePtr wrap (
+
+
+std :: vector < T > & v ,
+
+
+)
+
+
+
+
+static MemoryType memoryType ( void const * data )
+Determine the memory type of a pointer.
-
Private Types
-
-
-using element_type = std :: remove_pointer_t < pointer >
+Protected Functions
+
+
+IBuffer ( ) = default
-
-
-using EventPtr = std :: unique_ptr < element_type , Deleter >
-
+
+
+inline std :: size_t toBytes ( std :: size_t size ) const
+Returns an array index or size in bytes.
+
-
-
Private Members
-
-
-EventPtr mEvent
+
+
+
+
+template < MemoryType T > struct MemoryTypeString
-
-
-
-class Deleter
+
+
+template < > struct MemoryTypeString < MemoryType :: kCPU >
-
Public Functions
-
-
-inline explicit Deleter ( bool ownsEvent )
-
-
-
-
-inline explicit Deleter ( )
-
-
-
-
-inline constexpr void operator () ( pointer event ) const
-
-
-
-
-
Private Members
+
Public Static Attributes
-
-bool mOwnsEvent
+
+static auto constexpr value = "CPU"
+
+
+template < > struct MemoryTypeString < MemoryType :: kGPU >
+
+
Public Static Attributes
+
+
+static auto constexpr value = "GPU"
+
+
+
+
+
+
+
+template < > struct MemoryTypeString < MemoryType :: kPINNED >
+
+
Public Static Attributes
+
+
+static auto constexpr value = "PINNED"
+
+
+
+
+
+
+
+template < > struct MemoryTypeString < MemoryType :: kPINNEDPOOL >
+
+
Public Static Attributes
+
+
+static auto constexpr value = "PINNEDPOOL"
+
+
+
+
+
+
+
+template < > struct MemoryTypeString < MemoryType :: kUVM >
+
+
Public Static Attributes
+
+
+static auto constexpr value = "UVM"
+
+
+
+
+
+
+
+template < typename T , bool = false > struct TRTDataType
+
+#include <iBuffer.h>
+For converting a C++ data type to a TensorRT data type.
+
+
+
+
+template < > struct TRTDataType < bool >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kBOOL
+
+
+
+
+
+
+
+template < > struct TRTDataType < float >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kFLOAT
+
+
+
+
+
+
+
+template < > struct TRTDataType < half >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kHALF
+
+
+
+
+
+
+
+template < > struct TRTDataType < kernels :: FinishedState >
+
+
Public Static Attributes
+
+
+static constexpr auto value = TRTDataType < kernels :: FinishedState :: UnderlyingType > :: value
+
+
+
+
+
+
+
+template < > struct TRTDataType < kernels :: KVCacheIndex >
+
+
Public Static Attributes
+
+
+static constexpr auto value = TRTDataType < kernels :: KVCacheIndex :: UnderlyingType > :: value
+
+
+
+
+
+
+
+template < > struct TRTDataType < runtime :: RequestType >
+
+
Public Static Attributes
+
+
+static constexpr auto value = TRTDataType < std :: underlying_type_t < runtime :: RequestType > > :: value
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: int32_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kINT32
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: int64_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kINT64
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: int8_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kINT8
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: uint32_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = BufferDataType { nvinfer1 :: DataType :: kINT32 , true }
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: uint64_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = BufferDataType { nvinfer1 :: DataType :: kINT64 , true }
+
+
+
+
+
+
+
+template < > struct TRTDataType < std :: uint8_t >
+
+
Public Static Attributes
+
+
+static constexpr auto value = nvinfer1 :: DataType :: kUINT8
+
+
+
+
+
+
+
+template < typename T > struct TRTDataType < T * >
+
+
+
Private Static Attributes
+
+
+static auto constexpr kUnderlyingType = BufferDataType { TRTDataType < std :: remove_const_t < T > , false > :: value }
+
+
+
+
+
+
+
+template < > struct TRTDataType < void * >
+
@@ -7685,151 +3449,392 @@
-
-ipcNvlsMemory.h
+
+decodingOutput.h
+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+
+
+
+namespace runtime
+
+
+class DecodingOutput
+
+
+
Public Functions
+
+
+DecodingOutput ( ) = default
+
+
+
+
+
Public Members
+
+
+TensorPtr ids
+Mandatory parameters Previously generated token ids for all steps before DecodingInput.step , [BS, BM, MSL]
+
+
+
+
+TensorPtr gatheredIds
+The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids .
+
+
+
+
+TensorPtr newTokensSteps
+New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].
+
+
+
+
+TensorPtr newTokens
+A view of newTokensSteps for the current token, [BS, BM].
+
+
+
+
+std :: vector < TensorPtr > newTokensVec
+A Vector of views on newTokensSteps for each token [BS, BM].
+
+
+
+
+TensorPtr finishReasons
+Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]
+
+
+
+
+TensorPtr finishedSum
+The sum of finished sequences per request, in pinned memory, [BS].
+
+
+
+
+TensorPtr logProbs
+Mandatory parameters for Beam Search log-probability of generated tokens, [BS, BM, MSL], float
+
+
+
+
+TensorPtr cumLogProbs
+Sum log-probability of all generated tokens, [BS, BM].
+
+
+
+
+TensorPtr parentIds
+Index of the beam where the previous token is, [BS, BM, MSL].
+
+
+
+
+TensorPtr lengths
+Total sequence lengths including padding, [BS, BM].
+
+
+
+
+TensorPtr cacheIndirection
+K/V indirection for next generation step, [BS, BM, MSL].
+
+
+
+
+TensorPtr logProbsTiled
+Buffer used to store the transpose of the logProbs, [MSL, BS, BM].
+
+
+
+
+BeamHypotheses beamHypotheses
+
+
+
+
+std :: optional < SpeculativeDecodingOutputs > speculativeDecodingOutputs
+
+
+
+
+std :: optional < ExplicitDraftTokensBuffers :: Inputs > explicitDraftTokensBuffers
+
+
+
+
+std :: optional < LookaheadDecodingBuffers > lookaheadOutputs
+
+
+
+
+std :: optional < EagleBuffers :: Inputs > eagleBuffers
+
+
+
+
+
Public Static Attributes
+
+
+static float constexpr kNegativeInfinity = - 1e20f
+
+
+
+
+
+class BeamHypotheses
+
+
+
+
+
+
+class SpeculativeDecodingOutputs
+
+
+
+
+
+
+
+
+
+
+
+promptTuningParams.h
namespace tensorrt_llm
namespace runtime
+
+
+template < typename TTensor > class GenericPromptTuningParams
-
Functions
-
-
-void MPI_group_barrier ( std :: set < int > ranks )
+Public Types
+
+
+using TensorPtr = TTensor
-
-
-bool ipcNvlsSupported ( )
-
-
-
-
-IpcNvlsHandle * ipcNvlsAllocate ( size_t size , std :: set < int > ranks )
-
-
-
-
-void ipcNvlsFree ( IpcNvlsHandle * handle )
-
-
-
-
-
-template < typename T > class DeviceAllocationNvls
-
-
Public Functions
-
-
-DeviceAllocationNvls ( ) = default
-
-
-
-
-inline ~DeviceAllocationNvls ( )
-
-
-
-
-inline void reset ( size_t size , std :: set < int > ranks )
-
-
-
-
-inline T * getMulticastPointer ( ) const
-
-
-
-
-inline T * getUnicastPointer ( ) const
-
-
-
-
-inline T * * getIpcUnicastPointers ( )
-
-
-
-
-inline size_t getCapacity ( ) const
-
-
-
-
-inline void free ( )
+
+
+using SizeType32 = tensorrt_llm :: runtime :: SizeType32
+
-
-
-struct IpcNvlsHandle
+
+
+class PromptTuningParams : public tensorrt_llm :: runtime :: GenericPromptTuningParams < ITensor :: SharedPtr >
+
+
Public Functions
+
+
+inline explicit PromptTuningParams (
+
+
+TensorPtr embeddingTable = nullptr ,
+TensorPtr tasks = nullptr ,
+TensorPtr vocabSize = nullptr ,
+
+
+)
-
-
-std :: vector < uintptr_t > ipc_uc_ptrs
-
+
+
+void fillTasksTensor (
-
-
-CUdeviceptr uc_va
-
+
+TensorPtr tasksHost ,
+SizeType32 batchSize ,
+SizeType32 numContextRequests ,
+std :: vector < SizeType32 > const & reqBeamWidths ,
+std :: vector < SizeType32 > const & reqPromptLengths ,
+BufferManager const & manager ,
+bool packedInput ,
+
-
-
-CUdeviceptr mc_va
-
-
-
-
-std :: vector < CUdeviceptr > ipc_uc_vas
-
-
-
-
-CUmemGenericAllocationHandle uc_handle
-
-
-
-
-CUmemGenericAllocationHandle mc_handle
-
-
-
-
-std :: vector < CUmemGenericAllocationHandle > ipc_uc_handles
+)
@@ -7840,743 +3845,438 @@
-
-iTensor.h
-
-
-namespace nvinfer1
-
-
+
+bufferManager.h
namespace tensorrt_llm
namespace runtime
-
-
Functions
-
-
-inline std :: ostream & operator << (
-
-
-std :: ostream & output ,
-ITensor :: Shape const & dims ,
-
-
-)
-Utility function to print a shape.
-
-
-
-
-std :: ostream & operator << (
-
-
-std :: ostream & output ,
-ITensor const & tensor ,
-
-
-)
-Utility function to print a tensor with its shape.
-
-
-
-
-template < typename T > T const * bufferCastOrNull (
-
-
-ITensor :: SharedConstPtr const & tensorPtr ,
-
-
-)
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-tensorPtr – A possibly null shared ptr.
-
-Returns:
-A pointer to T const, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T * bufferCastOrNull (
-
-
-ITensor :: SharedPtr const & tensorPtr ,
-
-
-)
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-tensorPtr – A possibly null shared ptr.
-
-Returns:
-A pointer to T, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T * bufferCastOrNull (
-
-
-std :: optional < ITensor :: SharedPtr > const & optionalTensorPtr ,
-
-
-)
-Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-optionalBufferPtr – A possibly empty optional.
-
-Returns:
-A pointer to T, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T const * bufferCastOrNull (
-
-
-std :: optional < ITensor :: SharedConstPtr > const & optionalTensorPtr ,
-
-
-)
-Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-optionalBufferPtr – A possibly empty optional.
-
-Returns:
-A pointer to const T, possibly nullptr.
-
-
-
-
-
-
-
-class ITensor : public virtual tensorrt_llm :: runtime :: IBuffer
-
+
+
+class BufferManager
+
+#include <bufferManager.h>
+A helper class for managing memory on host and device.
+
Public Types
-
-using UniquePtr = std :: unique_ptr < ITensor >
+
+using IBufferPtr = IBuffer :: UniquePtr
-
-using SharedPtr = std :: shared_ptr < ITensor >
+
+using ITensorPtr = ITensor :: UniquePtr
-
-using UniqueConstPtr = std :: unique_ptr < ITensor const >
+
+using CudaStreamPtr = std :: shared_ptr < CudaStream >
-
-using SharedConstPtr = std :: shared_ptr < ITensor const >
-
-
-
-
-using Shape = nvinfer1 :: Dims
-
-
-
-
-using DimType64 = std :: remove_reference_t < decltype ( Shape :: d [ 0 ] ) >
-
-
-
-
-using TensorMap = runtime :: StringPtrMap < runtime :: ITensor >
+
+using CudaMemPoolPtr = std :: shared_ptr < CudaMemPool >
Public Functions
-
-~ITensor ( ) override = default
-
-
-
-
-virtual Shape const & getShape ( ) const = 0
-Returns the tensor dimensions.
+
+explicit BufferManager ( CudaStreamPtr stream , bool trimPool = false )
+Construct a BufferManager .
+
+Parameters:
+stream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
+
+
-
-template < SizeType32 n > inline DimType64 getDimension ( ) const
-Returns the tensor n-th dimension. If n is negative, returns the (nbDims - n)th dimension. TODO: replace with constexpr parameter when moving to C++20.
+
+inline ~BufferManager ( )
+Destructor.
-
-virtual void reshape ( Shape const & dims ) = 0
-Sets the tensor dimensions. The new size of the tensor will be volume(dims)
-
-
-
-
-inline virtual void resize ( std :: size_t newSize ) override
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
-
-
-
-ITensor ( ITensor const & ) = delete
-Not allowed to copy.
-
-
-
-
-ITensor & operator = ( ITensor const & ) = delete
-Not allowed to copy.
-
-
-
-
-inline void squeeze ( SizeType32 dim )
-Removes the given unit dimensions from this tensor.
-
-
-
-
-inline void unsqueeze ( SizeType32 dim )
-Adds a unit dimension at the specified position.
-
-
-
-
-inline bool shapeEquals ( Shape const & other ) const
-
-
-
-
-inline bool shapeEquals (
+
+IBufferPtr gpu (
-std :: initializer_list < SizeType32 > const & other ,
+std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-) const
-
+
) const
+
Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.
+
-
-template < typename T > inline bool shapeEquals (
+
+ITensorPtr gpu (
-T const * dims ,
-SizeType32 count ,
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-) const
-
+
) const
+
Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.
+
+
+
+
+IBufferPtr allocate (
+
+
+MemoryType memoryType ,
+std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+) const
+Allocates an IBuffer of the given size and memory type.
+
+
+
+
+ITensorPtr allocate (
+
+
+MemoryType memoryType ,
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+) const
+Allocates an ITensor of the given dimensions and memory type.
+
+
+
+
+inline IBufferPtr emptyBuffer (
+
+
+MemoryType memoryType ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+) const
+Create an empty IBuffer of the given memory type. It may be resized later.
+
+
+
+
+inline ITensorPtr emptyTensor (
+
+
+MemoryType memoryType ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+) const
+Create an empty ITensor of the given memory type. It may be reshaped later.
+
+
+
+
+void setMem ( IBuffer & buffer , int32_t value ) const
+Set the contents of the given buffer to value.
+
+
+
+
+void setZero ( IBuffer & buffer ) const
+Set the contents of the given buffer to zero.
+
+
+
+
+void copy ( void const * src , IBuffer & dst , MemoryType srcType ) const
+Copy src to dst .
+
+
+
+
+void copy ( IBuffer const & src , void * dst , MemoryType dstType ) const
+Copy src to dst .
+
+
+
+
+inline void copy ( void const * src , IBuffer & dst ) const
+Copy src to dst .
+
+
+
+
+inline void copy ( IBuffer const & src , void * dst ) const
+Copy src to dst .
+
+
+
+
+void copy ( IBuffer const & src , IBuffer & dst ) const
+Copy src to dst .
+
+
+
+
+IBufferPtr copyFrom ( IBuffer const & src , MemoryType memoryType ) const
+Copy src into a new IBuffer with a potentially different memory type.
+
+
+
+
+ITensorPtr copyFrom ( ITensor const & src , MemoryType memoryType ) const
+Copy src into a new ITensor with a potentially different memory type.
+
+
+
+
+template < typename T > inline IBufferPtr copyFrom (
+
+
+std :: vector < T > const & src ,
+MemoryType memoryType ,
+
+
+) const
+Copy src into a new IBuffer with a potentially different memory type.
+
+
+
+
+template < typename T > inline ITensorPtr copyFrom (
+
+
+T * src ,
+nvinfer1 :: Dims dims ,
+MemoryType memoryType ,
+
+
+) const
+Copy src into a new ITensor with a potentially different memory type.
+
+
+
+
+template < typename T > inline ITensorPtr copyFrom (
+
+
+std :: vector < T > const & src ,
+nvinfer1 :: Dims dims ,
+MemoryType memoryType ,
+
+
+) const
+Copy src into a new ITensor with a potentially different memory type.
+
+
+
+
+CudaStream const & getStream ( ) const
+Get the underlying cuda stream.
+
+
+
+
+std :: size_t memoryPoolReserved ( ) const
+The current size of the memory reserved by the memory pool.
+
+
+
+
+std :: size_t memoryPoolUsed ( ) const
+The current size of the memory used by the memory pool.
+
+
+
+
+std :: size_t memoryPoolFree ( ) const
+The current size of the memory free in the memory pool.
+
+
+
+
+void memoryPoolTrimTo ( std :: size_t size )
+Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
+
Public Static Functions
-
-static inline std :: int64_t volume ( Shape const & dims )
-Returns the volume of the dimensions. Returns -1 if d.nbDims < 0 .
-
-
-
-
-static inline std :: size_t volumeNonNegative ( Shape const & shape )
-Returns the volume of the dimensions. Throws if d.nbDims < 0 .
-
-
-
-
-static inline Shape strides ( Shape const & dims )
-Returns the strides of each dimemsion in a Shape.
-
-
-
-
-static Shape squeeze ( Shape const & shape , SizeType32 dim )
-Removes the given unit dimension from shape .
-
-Parameters:
-
-
-Returns:
-A new shape without the unit dimension.
-
-
-
-
-
-
-static Shape unsqueeze ( Shape const & shape , SizeType32 dim )
-Add a unit dimension to shape at the specified position.
-
-Parameters:
-
-
-Returns:
-A new shape with the added unit dimension.
-
-
-
-
-
-
-static UniquePtr slice (
+
+static IBufferPtr gpuSync (
-SharedPtr tensor ,
-std :: size_t offset ,
std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-Creates a sliced view on the underlying tensor . The view will have the same data type as tensor .
-
-Parameters:
-
-tensor – The tensor to view.
-offset – The offset of the view w.r.t. dimension 0 of the tensor.
-size – The size of the view w.r.t. dimension 0 of the tensor.
-
-
-Returns:
-A view on the buffer .
-
-
+)
+Allocates an IBuffer of the given size on the GPU, using cudaMalloc.
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+static ITensorPtr gpuSync (
+
+
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+)
+Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.
+
+
+
+
+static IBufferPtr cpu (
-TConstPtr & & tensor ,
-std :: size_t offset ,
std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
-
-
-
-static inline UniquePtr slice ( SharedPtr tensor , std :: size_t offset )
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-
-
-TConstPtr & & tensor ,
-std :: size_t offset ,
-
-
-)
-
-
-
-
-static UniquePtr slice (
-
-
-SharedPtr tensor ,
-Shape const & offsetDims ,
-DimType64 size ,
-
-
-)
-
-Parameters:
-
-offsetDims – The offset in multiple dimensions.
-tensor – The tensor to view.
-offsetDims – The offset dimensions of the view.
-size – The size of the view w.r.t. the last dimension in offsetDims.
-offsetDims – specifies all dimensions.
-
-
-Throws:
-Whenever – offset overflows or the last dimension offset+size overflows.
-
-Returns:
-A view of shape [size, the rest dimensions] or [size] when
-
-
+)
+Allocates an IBuffer of the given size on the CPU.
-
-static inline UniquePtr slice (
+
+static ITensorPtr cpu (
-SharedPtr tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
-DimType64 size ,
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
+
)
+
Allocates an ITensor of the given dimensions on the CPU.
+
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+static IBufferPtr pinned (
-TConstPtr & & tensor ,
-Shape const & offsetDims ,
std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
+
)
+
Allocates a pinned IBuffer of the given size on the CPU.
+
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+static ITensorPtr pinned (
+
+
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
+
+
+)
+Allocates a pinned ITensor of the given dimensions on the CPU.
+
+
+
+
+static IBufferPtr pinnedPool (
-TConstPtr & & tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
-
-
-
-static inline UniquePtr slice (
-
-
-SharedPtr tensor ,
-Shape const & offsetDims ,
-
-
-)
-return the rest slices at the last dimension when size omitted.
+)
+Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
-static inline UniquePtr slice (
+
+static ITensorPtr pinnedPool (
-SharedPtr tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-
-
-TConstPtr & & tensor ,
-Shape const & offsetDims ,
-
-
-)
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-
-
-TConstPtr & & tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
-
-
-)
-
-
-
-
-static inline UniquePtr at ( SharedPtr tensor , Shape const & offsetDims )
-
-Parameters:
-offsetDims – specifies all dimensions.
-
-Returns:
-Just the block at the point, with shape of [the rest dimensions] or [1] when
-
-
+)
+Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
-
-static inline UniquePtr at (
+
+static IBufferPtr managed (
-SharedPtr tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
+std :: size_t size ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr at (
-
-
-TConstPtr & & tensor ,
-Shape const & offsetDims ,
-
-
-)
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline ITensor :: UniqueConstPtr at (
-
-
-TConstPtr & & tensor ,
-std :: initializer_list < DimType64 > const & offsetDims ,
-
-
-)
-
-
-
-
-static UniquePtr view ( IBuffer :: SharedPtr buffer , Shape const & dims )
-Returns a view on the underlying buffer (or tensor) with the given shape.
-
-Parameters:
-
-
-Returns:
-A view on the tensor .
-
-
+)
+Allocates an IBuffer of the given size in UVM.
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr view (
+
+static ITensorPtr managed (
-TConstPtr & & tensor ,
-Shape const & dims ,
+nvinfer1 :: Dims dims ,
+nvinfer1 :: DataType type = kBYTE_TYPE ,
-)
-
-
-
-
-static inline UniquePtr view ( SharedPtr tensor )
-Returns a view on the underlying tensor which can be independently reshaped.
-
-Parameters:
-tensor – The tensor to view.
-
-Returns:
-A view on the tensor .
-
-
+)
+Allocates an ITensor of the given dimensions in UVM.
-
-static inline UniquePtr flattenN (
+
+static ITensorPtr ipcNvls (
-SharedPtr tensor ,
-std :: int64_t sliceN = - 1 ,
-
-
-)
-Returns a flattened view on the underlying tensor which can be independently reshaped.
-
-Parameters:
-
-
-Returns:
-A flatten view on the tensor .
-
-
-
-
-
-
-static UniquePtr wrap (
-
-
-void * data ,
+std :: set < int > ranks ,
+nvinfer1 :: Dims dims ,
nvinfer1 :: DataType type ,
-Shape const & shape ,
-std :: size_t capacity ,
-)
-Wraps the given data in an ITensor . The ITensor will not own the underlying data and cannot be reshaped beyond capacity .
-
-Parameters:
-
-data – The data to wrap.
-type – The data type of the data .
-shape – The shape of the tensor.
-capacity – The capacity of the buffer.
-
-
-Returns:
-An ITensor .
-
-
-
-
-
-
-static inline UniquePtr wrap (
-
-
-void * data ,
-nvinfer1 :: DataType type ,
-Shape const & shape ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-T * data ,
-Shape const & shape ,
-std :: size_t capacity ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-T * data ,
-Shape const & shape ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-std :: vector < T > & v ,
-Shape const & shape ,
-
-
-)
-
-
-
-
-static Shape makeShape (
-
-
-std :: initializer_list < DimType64 > const & dims ,
-
-
-)
-A convenience function to create a tensor shape with the given dimensions.
-
-
-
-
-static std :: string toString ( Shape const & dims )
-A convenience function for converting a tensor shape to a string .
-
-
-
-
-static inline bool shapeEquals ( Shape const & lhs , Shape const & rhs )
-A convenience function to compare shapes.
-
-
-
-
-template < typename T > static inline bool shapeEquals (
-
-
-Shape const & lhs ,
-T const * dims ,
-SizeType32 count ,
-
-
-)
-A convenience function to compare shapes.
+)
+Allocates an ITensor of the given dimensions for NVLS.
-
Protected Functions
-
-
-ITensor ( ) = default
+Public Static Attributes
+
+
+static auto constexpr kBYTE_TYPE = nvinfer1 :: DataType :: kUINT8
-
Protected Static Functions
-
-
-static inline DimType64 castSize ( size_t newSize )
+Private Members
+
+
+CudaStreamPtr mStream
+
+
+
+
+CudaMemPoolPtr mPool
+
+
+
+
+bool const mTrimPool
@@ -8584,7 +4284,7 @@
Friends
-friend class ITensorBindings
+friend class ::BufferManagerTest
@@ -8595,198 +4295,8 @@
-
-gptDecoderBatched.h
-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
-
-
-
-namespace runtime
-
-
-class GptDecoderBatched : public tensorrt_llm :: runtime :: IGptDecoderBatched
-
-#include <gptDecoderBatched.h>
-GPT decoder class with support for in-flight batching.
-
-
-
Public Functions
-
-
-explicit GptDecoderBatched ( CudaStreamPtr stream )
-
-
-
-
-virtual void setup (
-
-
-executor :: DecodingMode const & mode ,
-SizeType32 maxBatchSize ,
-SizeType32 maxBeamWidth ,
-nvinfer1 :: DataType dtype ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-
-
-) override
-Setup the decoder before calling forward()
-
-
-
-
-RequestVector const & genRequests ,
-TensorPtr const & batchSlots ,
-
-
-) override
-Disable Lookahead decoding.
-
-
-
-
-virtual CudaEvent forwardAsync (
-
-
-decoder :: DecoderState const & decoderState ,
-decoder_batch :: Input const & input ,
-
-
-) override
-Run one step for all requests without blocking the host process and return the token for synchronization.
-
-
-
-
-virtual void forward (
-
-
-decoder :: DecoderState const & decoderState ,
-decoder_batch :: Input const & input ,
-
-
-) override
-Run one step for all requests and wait for completion on the host.
-
-
-
-
-virtual CudaEvent finalize (
-
-
-decoder :: DecoderState const & decoderState ,
-SizeType32 batchSlot ,
-SamplingConfig const & samplingConfig ,
-bool streaming ,
-
-
-) const override
-Gather final beam search results for request batchSlot . Result will only be available after event returned.
-
-
-
-
-inline CudaStreamPtr getDecoderStream ( ) const
-
-
-
-
-inline IGptDecoder & getUnderlyingDecoder ( ) const
-
-
-
-
-inline BufferManager const & getBufferManager ( ) const
-
-
-
-
-
Private Types
-
-
-using GptDecoderPtr = std :: unique_ptr < IGptDecoder >
-
-
-
-
-
Private Functions
-
-
-void forwardDispatch (
-
-
-decoder :: DecoderState const & decoderState ,
-decoder_batch :: Input const & input ,
-
-
-)
-Calls decoders for tokens per engine step.
-
-
-
-
-
-
-
-
-
-
-
-
-eagleModule.h
+
+gptJsonConfig.h
namespace tensorrt_llm
@@ -8794,65 +4304,222 @@
namespace runtime
-
-class EagleModule : public tensorrt_llm :: runtime :: SpeculativeDecodingModule
+
+class GptJsonConfig
Public Functions
-
-inline explicit EagleModule (
+
+inline GptJsonConfig (
-SizeType32 maxDraftPathLen ,
-SizeType32 maxDecodingDraftTokens ,
-SizeType32 numTransformersLayer ,
-SizeType32 maxNonLeafNodesPerLayer ,
+std :: string name ,
+std :: string version ,
+std :: string precision ,
+SizeType32 tensorParallelism ,
+SizeType32 pipelineParallelism ,
+SizeType32 contextParallelism ,
+SizeType32 gpusPerNode ,
+ModelConfig modelConfig ,
+std :: optional < RuntimeDefaults > runtimeDefaults = std :: nullopt ,
-) noexcept
+)
-
-inline explicit EagleModule ( ) noexcept
+
+inline ModelConfig const & getModelConfig ( ) const
-
-inline executor :: EagleChoices const & getDefaultEagleChoices (
+
+inline ModelConfig & getModelConfigMutable ( )
+
+
+
+
+inline std :: string const & getName ( ) const
+
+
+
+
+inline std :: string const & getVersion ( ) const
+
+
+
+
+inline std :: string const & getPrecision ( ) const
+
+
+
+
+inline SizeType32 constexpr getTensorParallelism ( ) const
+
+
+
+
+inline SizeType32 constexpr getPipelineParallelism ( ) const
+
+
+
+
+inline SizeType32 constexpr getContextParallelism ( ) const
+
+
+
+
+inline SizeType32 constexpr getGpusPerNode ( ) const
+
+
+
+
+inline SizeType32 constexpr getWorldSize ( ) const
+
+
+
+
+inline std :: optional < RuntimeDefaults > getRuntimeDefaults ( ) const
+
+
+
+
+std :: string engineFilename (
+WorldConfig const & worldConfig ,
+std :: string const & model ,
-) const noexcept
+) const
-
-inline SizeType32 getNumTransformerLayers ( ) const noexcept
+
+inline std :: string engineFilename (
+
+
+WorldConfig const & worldConfig ,
+
+
+) const
+
+
+
+
+
Public Static Functions
+
+
+static GptJsonConfig parse ( std :: string const & json )
-
-inline SizeType32 getMaxNonLeafNodesPerLayer ( ) const noexcept
+
+static GptJsonConfig parse ( std :: istream & json )
+
+
+
+
+static GptJsonConfig parse ( std :: filesystem :: path const & path )
Private Members
-
-SizeType32 mNumTransformersLayer
+
+std :: string const mName
-
-SizeType32 mMaxNonLeafNodesPerLayer
+
+std :: string const mVersion
-
-executor :: EagleChoices mDefaultEagleChoices = { { 0 } , { 0 , 0 } , { 1 } , { 0 , 1 } , { 2 } , { 0 , 0 , 0 } , { 1 , 0 } , { 0 , 2 } , { 3 } , { 0 , 3 } , { 4 } , { 0 , 4 } , { 2 , 0 } , { 0 , 5 } , { 0 , 0 , 1 } , { 5 } , { 0 , 6 } , { 6 } , { 0 , 7 } , { 0 , 1 , 0 } , { 1 , 1 } , { 7 } , { 0 , 8 } , { 0 , 0 , 2 } , { 3 , 0 } , { 0 , 9 } , { 8 } , { 9 } , { 1 , 0 , 0 } , { 0 , 2 , 0 } , { 1 , 2 } , { 0 , 0 , 3 } , { 4 , 0 } , { 2 , 1 } , { 0 , 0 , 4 } , { 0 , 0 , 5 } , { 0 , 0 , 0 , 0 } , { 0 , 1 , 1 } , { 0 , 0 , 6 } , { 0 , 3 , 0 } , { 5 , 0 } , { 1 , 3 } , { 0 , 0 , 7 } , { 0 , 0 , 8 } , { 0 , 0 , 9 } , { 6 , 0 } , { 0 , 4 , 0 } , { 1 , 4 } , { 7 , 0 } , { 0 , 1 , 2 } , { 2 , 0 , 0 } , { 3 , 1 } , { 2 , 2 } , { 8 , 0 } , { 0 , 5 , 0 } , { 1 , 5 } , { 1 , 0 , 1 } , { 0 , 2 , 1 } , { 9 , 0 } , { 0 , 6 , 0 } , { 0 , 0 , 0 , 1 } , { 1 , 6 } , { 0 , 7 , 0 } }
+
+std :: string const mPrecision
+
+
+
+
+SizeType32 const mTensorParallelism
+
+
+
+
+SizeType32 const mPipelineParallelism
+
+
+
+
+SizeType32 const mContextParallelism
+
+
+
+
+SizeType32 const mGpusPerNode
+
+
+
+
+ModelConfig mModelConfig
+
+
+
+
+std :: optional < RuntimeDefaults > mRuntimeDefaults
+
+
+
+
+
+
+
+
+
+
+
+runtimeDefaults.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+struct RuntimeDefaults
+
+
Public Functions
+
+
+inline RuntimeDefaults (
+
+
+std :: optional < std :: vector < SizeType32 > > maxAttentionWindowVec ,
+std :: optional < SizeType32 > sinkTokenLength ,
+
+
+)
+
+
+
+
+RuntimeDefaults ( ) = default
+
+
+
+
+
Public Members
+
+
+std :: optional < std :: vector < SizeType32 > > maxAttentionWindowVec
+
+
+
+
+std :: optional < SizeType32 > sinkTokenLength
@@ -9754,6 +5421,2213 @@
+
+
+rawEngine.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class RawEngine
+
+
Public Types
+
+
+enum Type
+Values:
+
+
+enumerator FilePath
+
+
+
+
+enumerator AddressWithSize
+
+
+
+
+enumerator HostMemory
+
+
+
+
+
+
+
Public Functions
+
+
+inline explicit RawEngine ( std :: filesystem :: path enginePath ) noexcept
+
+
+
+
+inline explicit RawEngine (
+
+
+void const * engineAddr ,
+std :: size_t engineSize ,
+
+
+) noexcept
+
+
+
+
+inline explicit RawEngine (
+
+
+nvinfer1 :: IHostMemory const * engineBuffer ,
+
+
+) noexcept
+
+
+
+
+inline Type getType ( ) const
+
+
+
+
+inline std :: filesystem :: path getPath ( ) const
+
+
+
+
+inline std :: optional < std :: filesystem :: path > getPathOpt ( ) const
+
+
+
+
+inline void setPath ( std :: filesystem :: path enginePath )
+
+
+
+
+inline std :: optional < std :: map < std :: string , tensorrt_llm :: executor :: Tensor > > const & getManagedWeightsMapOpt (
+
+
+
+
+) const
+
+
+
+
+inline void setManagedWeightsMap (
+
+
+std :: map < std :: string , tensorrt_llm :: executor :: Tensor > managedWeightsMap ,
+
+
+)
+
+
+
+
+inline void const * getAddress ( ) const
+
+
+
+
+inline std :: size_t getSize ( ) const
+
+
+
+
+inline nvinfer1 :: IHostMemory const * getHostMemory ( ) const
+
+
+
+
+
Public Members
+
+
+void const * mEngineAddr = { }
+
+
+
+
+std :: size_t mEngineSize = { }
+
+
+
+
+
Private Members
+
+
+Type mType
+
+
+
+
+std :: optional < std :: filesystem :: path > mEnginePath
+
+
+
+
+struct tensorrt_llm::runtime::RawEngine
+
+
+
+
+nvinfer1 :: IHostMemory const * mEngineBuffer = { }
+
+
+
+
+std :: optional < std :: map < std :: string , tensorrt_llm :: executor :: Tensor > > mManagedWeightsMap
+
+
+
+
+
+
+
+
+
+
+
+gptDecoder.h
+
+
+namespace tensorrt_llm
+
+
+namespace layers
+
+
+
+
+namespace runtime
+
+
Functions
+
+
+inline runtime :: ITensor :: SharedConstPtr getDefaultBatchSlots (
+
+
+runtime :: SizeType32 batchSize ,
+
+
+)
+Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.
+
+
+
+
+
+template < typename T > class GptDecoder : public virtual tensorrt_llm :: runtime :: IGptDecoder
+
+
+
Public Functions
+
+
+GptDecoder (
+
+
+executor :: DecodingMode const & mode ,
+size_t maxBatchSize ,
+size_t maxBeamWidth ,
+size_t vocabSize ,
+size_t vocabSizePadded ,
+CudaStreamPtr const & stream ,
+std :: shared_ptr < SpeculativeDecodingModule const > speculativeDecodingModule = nullptr ,
+
+
+)
+
+
+
+
+virtual void setup (
+
+
+SamplingConfig const & samplingConfig ,
+size_t batchSize ,
+TensorConstPtr const & batchSlots ,
+std :: optional < DecodingOutput > const & output = std :: nullopt ,
+std :: optional < nvinfer1 :: DataType > explicitDraftTokensDType = std :: nullopt ,
+std :: optional < std :: vector < TensorConstPtr > > const & lookaheadPrompt = std :: nullopt ,
+std :: optional < std :: vector < executor :: LookaheadDecodingConfig > > const & lookaheadAlgoConfigs = std :: nullopt ,
+
+
+) override
+
+Parameters:
+explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around the lack of a bf16 decoder.
+
+
+
+
+
+
+virtual void forwardAsync (
+
+
+DecodingOutput & output ,
+DecodingInput const & input ,
+
+
+) override
+
+
+
+
+virtual void forwardSync (
+
+
+DecodingOutput & output ,
+DecodingInput const & input ,
+
+
+) override
+
+
+
+
+inline virtual SamplingConfig const & getSamplingConfig ( ) override
+
+
+
+
+std :: optional < SamplingConfig > const & samplingConfig ,
+SizeType32 batchSize ,
+TensorConstPtr batchSlots ,
+
+
+) override
+
+
+
+
+
Private Members
+
+
+std :: shared_ptr < BufferManager > mManager
+
+
+
+
+std :: shared_ptr < tensorrt_llm :: layers :: DynamicDecodeLayer < T > > mDynamicDecodeLayer
+
+
+
+
+std :: shared_ptr < tensorrt_llm :: runtime :: DecodingLayerWorkspace > mDecodingLayerWorkspace
+
+
+
+
+SamplingConfig mSamplingConfig
+
+
+
+
+size_t mMaxBatchSize
+
+
+
+
+size_t mVocabSize
+
+
+
+
+size_t mVocabSizePadded
+
+
+
+
+executor :: DecodingMode mDecodingMode
+
+
+
+
+
+
+
+class IGptDecoder
+Subclassed by tensorrt_llm::runtime::GptDecoder< T >
+
+
+
Public Functions
+
+
+virtual ~IGptDecoder ( ) = default
+
+
+
+
+virtual void setup (
+
+
+SamplingConfig const & samplingConfig ,
+size_t batchSize ,
+TensorConstPtr const & batchSlots ,
+std :: optional < DecodingOutput > const & output = std :: nullopt ,
+std :: optional < nvinfer1 :: DataType > explicitDraftTokensDType = std :: nullopt ,
+std :: optional < std :: vector < TensorConstPtr > > const & lookaheadPrompt = std :: nullopt ,
+std :: optional < std :: vector < executor :: LookaheadDecodingConfig > > const & lookaheadAlgoConfigs = std :: nullopt ,
+
+
+) = 0
+
+Parameters:
+explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around the lack of a bf16 decoder.
+
+
+
+
+
+
+virtual void forwardAsync (
+
+
+DecodingOutput & output ,
+DecodingInput const & input ,
+
+
+) = 0
+
+
+
+
+virtual void forwardSync (
+
+
+DecodingOutput & output ,
+DecodingInput const & input ,
+
+
+) = 0
+
+
+
+
+virtual SamplingConfig const & getSamplingConfig ( ) = 0
+
+
+
+
+std :: optional < SamplingConfig > const & samplingConfig ,
+SizeType32 batchSize ,
+TensorConstPtr batchSlots ,
+
+
+) = 0
+
+
+
+
+
+
+
+
+
+
+
+
+eagleBuffers.h
+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+
+
+
+namespace runtime
+
+
+class EagleBuffers
+
+
+
Public Functions
+
+
+EagleBuffers (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+runtime :: BufferManager const & manager ,
+runtime :: ModelConfig const & modelConfig ,
+runtime :: WorldConfig const & worldConfig ,
+executor :: DecodingConfig const & decodingConfig ,
+
+
+)
+
+
+
+
+void reshape (
+
+
+SizeType32 numCtxSequences ,
+SizeType32 numGenSequences ,
+runtime :: ModelConfig const & modelConfig ,
+
+
+)
+
+
+
+
+void setFromInputs (
+
+
+RequestVector const & contextRequests ,
+RequestVector const & genRequests ,
+runtime :: ITensor const & requestTypes ,
+ITensor const & seqSlots ,
+EagleBuffers :: Inputs const & decoderBuffers ,
+runtime :: BufferManager const & manager ,
+runtime :: ModelConfig const & modelConfig ,
+runtime :: WorldConfig const & worldConfig ,
+
+
+) const
+
+
+
+
+void insertInputTensors (
+
+
+TensorMap & inputBuffers ,
+TensorMap & outputBuffers ,
+runtime :: WorldConfig const & worldConfig ,
+
+
+) const
+
+
+
+
+
+
+
Private Members
+
+
+std :: size_t scanReduceTempStorageBytes = { 0 }
+
+
+
+
+float mDefaultPosteriorThreshold = { 0.09f }
+
+
+
+
+bool mDoGreedySampling = { true }
+
+
+
+
+BufferPtr scanReduceTempStorage
+
+
+
+
+TensorPtr cumSumGenerationLengths
+
+
+
+
+TensorPtr maxGenerationLength
+
+
+
+
+TensorPtr chunkedContextNextTokensHost
+
+
+
+
+TensorPtr greedySamplingHost
+
+
+
+
+TensorPtr posteriorAlphaHost
+
+
+
+
+TensorPtr posteriorThresholdHost
+
+
+
+
+
+class EngineOutputs
+
+
Public Members
+
+
+TensorPtr nextDraftTokens
+[batchSize, maxDecodingDraftTokens]
+
+
+
+
+TensorPtr nextDraftLens
+[batchSize]
+
+
+
+
+TensorPtr nextDraftPaths
+[batchSize, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr acceptedTokens
+[batchSize, maxPathLen]
+
+
+
+
+TensorPtr acceptedLens
+[batchSize]
+
+
+
+
+TensorPtr acceptedPaths
+[batchSize]
+
+
+
+
+TensorPtr chunkedContextNextTokens
+[batchSize]
+
+
+
+
+
+
+
+class Inputs
+
+
+
Public Members
+
+
+TensorPtr temperatures
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr posteriorAlpha
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr posteriorThreshold
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr randomDataSample
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr randomDataValidation
+[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]
+
+
+
+
+TensorPtr draftTokens
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+
+
+
+
+TensorPtr draftLens
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr draftPaths
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr draftPathsHost
+[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr specDecodingGenerationLengths
+[maxBatchSize] or [numGenSequences]
+
+
+
+
+TensorPtr specDecodingGenerationLengthsHost
+[maxBatchSize] or [numGenSequences]
+
+
+
+
+TensorPtr specDecodingPackedMasks
+[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
+
+
+
+
+TensorPtr specDecodingPositionOffsets
+[maxBatchSize] or [numGenSequences]
+
+
+
+
+TensorPtr eagleNetCtxRequestTypesHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr eagleNetCtxContextLengthsHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr eagleNetCtxPastKeyValueLengthsHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr eagleNetGenRequestTypesHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr eagleNetGenContextLengthsHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr eagleNetGenPastKeyValueLengthsHost
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr inputGenTokensHost
+[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]
+
+
+
+
+TensorPtr chunkedContextNextTokens
+[maxBatchSize] or [numSequences]
+
+
+
+
+TensorPtr useSpecDecoding
+[1]
+
+
+
+
+TensorPtr useDynamicTreeHost
+[1]
+
+
+
+
+TensorPtr dynamicTreeMaxTopKHost
+[1]
+
+
+
+
+TensorPtr prevScores
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+
+
+
+
+TensorPtr currentExpandIndices
+[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
+
+
+
+
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+
+
+
+
+TensorPtr allLayersDraftTokenIds
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+
+
+
+
+TensorPtr allLayersDraftTokenIdsPredecessor
+[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]
+
+
+
+
+
+
+
+
+
+
+
+
+
+medusaModule.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class MedusaModule : public tensorrt_llm :: runtime :: SpeculativeDecodingModule
+
+
+
Public Functions
+
+
+inline explicit MedusaModule (
+
+
+SizeType32 maxAcceptedTokens ,
+SizeType32 maxDraftTokens ,
+
+
+) noexcept
+
+
+
+
+inline explicit MedusaModule ( ) noexcept
+
+
+
+
+inline MedusaChoices const & getMedusaChoices ( ) const noexcept
+
+
+
+
+
Private Members
+
+
+MedusaChoices mDefaultMedusaChoices = { { 0 } , { 0 , 0 } , { 1 } , { 0 , 1 } , { 2 } , { 0 , 0 , 0 } , { 1 , 0 } , { 0 , 2 } , { 3 } , { 0 , 3 } , { 4 } , { 0 , 4 } , { 2 , 0 } , { 0 , 5 } , { 0 , 0 , 1 } , { 5 } , { 0 , 6 } , { 6 } , { 0 , 7 } , { 0 , 1 , 0 } , { 1 , 1 } , { 7 } , { 0 , 8 } , { 0 , 0 , 2 } , { 3 , 0 } , { 0 , 9 } , { 8 } , { 9 } , { 1 , 0 , 0 } , { 0 , 2 , 0 } , { 1 , 2 } , { 0 , 0 , 3 } , { 4 , 0 } , { 2 , 1 } , { 0 , 0 , 4 } , { 0 , 0 , 5 } , { 0 , 0 , 0 , 0 } , { 0 , 1 , 1 } , { 0 , 0 , 6 } , { 0 , 3 , 0 } , { 5 , 0 } , { 1 , 3 } , { 0 , 0 , 7 } , { 0 , 0 , 8 } , { 0 , 0 , 9 } , { 6 , 0 } , { 0 , 4 , 0 } , { 1 , 4 } , { 7 , 0 } , { 0 , 1 , 2 } , { 2 , 0 , 0 } , { 3 , 1 } , { 2 , 2 } , { 8 , 0 } , { 0 , 5 , 0 } , { 1 , 5 } , { 1 , 0 , 1 } , { 0 , 2 , 1 } , { 9 , 0 } , { 0 , 6 , 0 } , { 0 , 0 , 0 , 1 } , { 1 , 6 } , { 0 , 7 , 0 } }
+
+
+
+
+
+
+
+
+
+
+
+explicitDraftTokensBuffers.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class ExplicitDraftTokensBuffers
+
+
+
Public Functions
+
+
+ExplicitDraftTokensBuffers (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+runtime :: BufferManager const & manager ,
+runtime :: ModelConfig const & modelConfig ,
+runtime :: WorldConfig const & worldConfig ,
+
+
+)
+
+
+
+
+void reshape (
+
+
+SizeType32 numCtxSequences ,
+SizeType32 numGenSequences ,
+runtime :: ModelConfig const & modelConfig ,
+
+
+)
+
+
+
+
+void setFromInputs (
+
+
+SizeType32 numCtxSequences ,
+SizeType32 numGenSequences ,
+runtime :: ITensor const & requestTypes ,
+ITensor const & seqSlots ,
+ExplicitDraftTokensBuffers :: Inputs const & decoderBuffers ,
+ITensor const & contextPositionIds ,
+runtime :: ModelConfig const & modelConfig ,
+runtime :: WorldConfig const & worldConfig ,
+runtime :: BufferManager const & manager ,
+runtime :: CudaStream const & stream ,
+
+
+) const
+
+
+
+
+void insertInputTensors (
+
+
+TensorMap & inputBuffers ,
+TensorMap & outputBuffers ,
+runtime :: WorldConfig const & worldConfig ,
+
+
+) const
+
+
+
+
+
+
+
+class EngineInputs : public tensorrt_llm :: runtime :: ExplicitDraftTokensBuffers :: Inputs
+
+
Public Members
+
+
+TensorPtr requestTypesDevice
+[numSequences], on gpu
+
+
+
+
+TensorPtr positionOffsets
+[numGenSequences]
+
+
+
+
+
+
+
+class EngineOutputs
+
+
Public Members
+
+
+TensorPtr nextGenerationLengths
+[batchSize]
+
+
+
+
+TensorPtr nextPositionOffsets
+[batchSize]
+
+
+
+
+TensorPtr masks
+[batchSize, maxDecodingTokens, maxDecodingTokens], bool
+
+
+
+
+TensorPtr nextDraftTokens
+[batchSize, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr nextDraftIndices
+[batchSize, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr nextDraftProbs
+[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]
+
+
+
+
+TensorPtr nextFlatTokens
+[batchSize * maxDecodingTokens]
+
+
+
+
+TensorPtr bestPathLengths
+[batchSize]
+
+
+
+
+TensorPtr bestPathIndices
+[batchSize]
+
+
+
+
+TensorPtr maxGenToken
+[1]
+
+
+
+
+TensorPtr totalGenToken
+[1]
+
+
+
+
+TensorPtr packedPositionIds
+[batchSize * maxDecodingTokens]
+
+
+
+
+
+
+
+class Inputs
+Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs
+
+
+
Public Members
+
+
+TensorPtr temperatures
+[maxBatchSize]
+
+
+
+
+TensorPtr positionIdsBase
+[maxBatchSize]
+
+
+
+
+TensorPtr generationLengths
+[maxBatchSize] or [numGenSequences]
+
+
+
+
+TensorPtr randomDataSample
+[maxBatchSize]
+
+
+
+
+TensorPtr randomDataValidation
+[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]
+
+
+
+
+TensorPtr draftTokens
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr draftIndices
+[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]
+
+
+
+
+TensorPtr draftProbs
+[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]
+
+
+
+
+TensorPtr packedMasks
+[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
+
+
+
+
+TensorPtr positionIds
+[maxBatchSize] or [numGenSequences]
+
+
+
+
+TensorPtr maxGenLengthHost
+
+
+
+
+TensorPtr generationLengthsHost
+
+
+
+
+TensorPtr useSpecDecoding
+
+
+
+
+
+
+
+
+
+
+
+
+
+iTensor.h
+
+
+namespace nvinfer1
+
+
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
Functions
+
+
+inline std :: ostream & operator << (
+
+
+std :: ostream & output ,
+ITensor :: Shape const & dims ,
+
+
+)
+Utility function to print a shape.
+
+
+
+
+std :: ostream & operator << (
+
+
+std :: ostream & output ,
+ITensor const & tensor ,
+
+
+)
+Utility function to print a tensor with its shape.
+
+
+
+
+template < typename T > T const * bufferCastOrNull (
+
+
+ITensor :: SharedConstPtr const & tensorPtr ,
+
+
+)
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+tensorPtr – A possibly null shared ptr.
+
+Returns:
+A pointer to T const, possibly nullptr.
+
+
+
+
+
+
+template < typename T > T * bufferCastOrNull (
+
+
+ITensor :: SharedPtr const & tensorPtr ,
+
+
+)
+Retrieves a T typed pointer to the underlying data of the buffer pointed to by the tensorPtr, or nullptr if the tensorPtr is null.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+tensorPtr – A possibly null shared ptr.
+
+Returns:
+A pointer to T, possibly nullptr.
+
+
+
+
+
+
+template < typename T > T * bufferCastOrNull (
+
+
+std :: optional < ITensor :: SharedPtr > const & optionalTensorPtr ,
+
+
+)
+Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+optionalTensorPtr – A possibly empty optional.
+
+Returns:
+A pointer to T, possibly nullptr.
+
+
+
+
+
+
+template < typename T > T const * bufferCastOrNull (
+
+
+std :: optional < ITensor :: SharedConstPtr > const & optionalTensorPtr ,
+
+
+)
+Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.
+This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.
+
+Template Parameters:
+T – The type of the underlying data.
+
+Parameters:
+optionalTensorPtr – A possibly empty optional.
+
+Returns:
+A pointer to const T, possibly nullptr.
+
+
+
+
+
+
+
+class ITensor : public virtual tensorrt_llm :: runtime :: IBuffer
+
+
Public Types
+
+
+using UniquePtr = std :: unique_ptr < ITensor >
+
+
+
+
+using SharedPtr = std :: shared_ptr < ITensor >
+
+
+
+
+using UniqueConstPtr = std :: unique_ptr < ITensor const >
+
+
+
+
+using SharedConstPtr = std :: shared_ptr < ITensor const >
+
+
+
+
+using Shape = nvinfer1 :: Dims
+
+
+
+
+using DimType64 = std :: remove_reference_t < decltype ( Shape :: d [ 0 ] ) >
+
+
+
+
+using TensorMap = runtime :: StringPtrMap < runtime :: ITensor >
+
+
+
+
+
Public Functions
+
+
+~ITensor ( ) override = default
+
+
+
+
+virtual Shape const & getShape ( ) const = 0
+Returns the tensor dimensions.
+
+
+
+
+template < SizeType32 n > inline DimType64 getDimension ( ) const
+Returns the n-th dimension of the tensor. If n is negative, returns the (nbDims + n)-th dimension, i.e. counting from the last dimension. TODO: replace with constexpr parameter when moving to C++20.
+
+
+
+
+virtual void reshape ( Shape const & dims ) = 0
+Sets the tensor dimensions. The new size of the tensor will be volume(dims)
+
+
+
+
+inline virtual void resize ( std :: size_t newSize ) override
+Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
+
+
+
+
+ITensor ( ITensor const & ) = delete
+Not allowed to copy.
+
+
+
+
+ITensor & operator = ( ITensor const & ) = delete
+Not allowed to copy.
+
+
+
+
+inline void squeeze ( SizeType32 dim )
+Removes the given unit dimensions from this tensor.
+
+
+
+
+inline void unsqueeze ( SizeType32 dim )
+Adds a unit dimension at the specified position.
+
+
+
+
+inline bool shapeEquals ( Shape const & other ) const
+
+
+
+
+inline bool shapeEquals (
+
+
+std :: initializer_list < SizeType32 > const & other ,
+
+
+) const
+
+
+
+
+template < typename T > inline bool shapeEquals (
+
+
+T const * dims ,
+SizeType32 count ,
+
+
+) const
+
+
+
+
+
Public Static Functions
+
+
+static inline std :: int64_t volume ( Shape const & dims )
+Returns the volume of the dimensions. Returns -1 if d.nbDims < 0 .
+
+
+
+
+static inline std :: size_t volumeNonNegative ( Shape const & shape )
+Returns the volume of the dimensions. Throws if d.nbDims < 0 .
+
+
+
+
+static inline Shape strides ( Shape const & dims )
+Returns the strides of each dimension in a Shape.
+
+
+
+
+static Shape squeeze ( Shape const & shape , SizeType32 dim )
+Removes the given unit dimension from shape .
+
+Parameters:
+
+
+Returns:
+A new shape without the unit dimension.
+
+
+
+
+
+
+static Shape unsqueeze ( Shape const & shape , SizeType32 dim )
+Add a unit dimension to shape at the specified position.
+
+Parameters:
+
+
+Returns:
+A new shape with the added unit dimension.
+
+
+
+
+
+
+static UniquePtr slice (
+
+
+SharedPtr tensor ,
+std :: size_t offset ,
+std :: size_t size ,
+
+
+)
+Creates a sliced view on the underlying tensor . The view will have the same data type as tensor .
+
+Parameters:
+
+tensor – The tensor to view.
+offset – The offset of the view w.r.t. dimension 0 of the tensor.
+size – The size of the view w.r.t. dimension 0 of the tensor.
+
+
+Returns:
+A view on the buffer .
+
+
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+std :: size_t offset ,
+std :: size_t size ,
+
+
+)
+
+
+
+
+static inline UniquePtr slice ( SharedPtr tensor , std :: size_t offset )
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+std :: size_t offset ,
+
+
+)
+
+
+
+
+static UniquePtr slice (
+
+
+SharedPtr tensor ,
+Shape const & offsetDims ,
+DimType64 size ,
+
+
+)
+
+Parameters:
+
+offsetDims – The offset in multiple dimensions.
+tensor – The tensor to view.
+offsetDims – The offset dimensions of the view.
+size – The size of the view w.r.t. the last dimension in offsetDims.
+offsetDims – specifies all dimensions.
+
+
+Throws:
+Whenever – offset overflows or the last dimension offset+size overflows.
+
+Returns:
+A view of shape [size, the rest dimensions] or [size] when offsetDims specifies all dimensions.
+
+
+
+
+
+
+static inline UniquePtr slice (
+
+
+SharedPtr tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+DimType64 size ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+Shape const & offsetDims ,
+std :: size_t size ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+std :: size_t size ,
+
+
+)
+
+
+
+
+static inline UniquePtr slice (
+
+
+SharedPtr tensor ,
+Shape const & offsetDims ,
+
+
+)
+Returns the remaining slices along the last dimension when size is omitted.
+
+
+
+
+static inline UniquePtr slice (
+
+
+SharedPtr tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+Shape const & offsetDims ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
+
+
+TConstPtr & & tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+
+
+)
+
+
+
+
+static inline UniquePtr at ( SharedPtr tensor , Shape const & offsetDims )
+
+Parameters:
+offsetDims – specifies all dimensions.
+
+Returns:
+Just the block at the point, with shape of [the rest dimensions] or [1] when offsetDims specifies all dimensions.
+
+
+
+
+
+
+static inline UniquePtr at (
+
+
+SharedPtr tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr at (
+
+
+TConstPtr & & tensor ,
+Shape const & offsetDims ,
+
+
+)
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline ITensor :: UniqueConstPtr at (
+
+
+TConstPtr & & tensor ,
+std :: initializer_list < DimType64 > const & offsetDims ,
+
+
+)
+
+
+
+
+static UniquePtr view ( IBuffer :: SharedPtr buffer , Shape const & dims )
+Returns a view on the underlying buffer (or tensor) with the given shape.
+
+Parameters:
+
+
+Returns:
+A view on the tensor .
+
+
+
+
+
+
+template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr view (
+
+
+TConstPtr & & tensor ,
+Shape const & dims ,
+
+
+)
+
+
+
+
+static inline UniquePtr view ( SharedPtr tensor )
+Returns a view on the underlying tensor which can be independently reshaped.
+
+Parameters:
+tensor – The tensor to view.
+
+Returns:
+A view on the tensor .
+
+
+
+
+
+
+static inline UniquePtr flattenN (
+
+
+SharedPtr tensor ,
+std :: int64_t sliceN = - 1 ,
+
+
+)
+Returns a flattened view on the underlying tensor which can be independently reshaped.
+
+Parameters:
+
+
+Returns:
+A flattened view on the tensor .
+
+
+
+
+
+
+static UniquePtr wrap (
+
+
+void * data ,
+nvinfer1 :: DataType type ,
+Shape const & shape ,
+std :: size_t capacity ,
+
+
+)
+Wraps the given data in an ITensor . The ITensor will not own the underlying data and cannot be reshaped beyond capacity .
+
+Parameters:
+
+data – The data to wrap.
+type – The data type of the data .
+shape – The shape of the tensor.
+capacity – The capacity of the buffer.
+
+
+Returns:
+An ITensor .
+
+
+
+
+
+
+static inline UniquePtr wrap (
+
+
+void * data ,
+nvinfer1 :: DataType type ,
+Shape const & shape ,
+
+
+)
+
+
+
+
+template < typename T > static inline UniquePtr wrap (
+
+
+T * data ,
+Shape const & shape ,
+std :: size_t capacity ,
+
+
+)
+
+
+
+
+template < typename T > static inline UniquePtr wrap (
+
+
+T * data ,
+Shape const & shape ,
+
+
+)
+
+
+
+
+template < typename T > static inline UniquePtr wrap (
+
+
+std :: vector < T > & v ,
+Shape const & shape ,
+
+
+)
+
+
+
+
+static Shape makeShape (
+
+
+std :: initializer_list < DimType64 > const & dims ,
+
+
+)
+A convenience function to create a tensor shape with the given dimensions.
+
+
+
+
+static std :: string toString ( Shape const & dims )
+A convenience function for converting a tensor shape to a string .
+
+
+
+
+static inline bool shapeEquals ( Shape const & lhs , Shape const & rhs )
+A convenience function to compare shapes.
+
+
+
+
+template < typename T > static inline bool shapeEquals (
+
+
+Shape const & lhs ,
+T const * dims ,
+SizeType32 count ,
+
+
+)
+A convenience function to compare shapes.
+
+
+
+
+
Protected Functions
+
+
+ITensor ( ) = default
+
+
+
+
+
Protected Static Functions
+
+
+static inline DimType64 castSize ( size_t newSize )
+
+
+
+
+
Friends
+
+
+friend class ITensorBindings
+
+
+
+
+
+
+
+
+
+
+
+common.h
+
+
Defines
+
+
+FMT_DIM
+
+
+
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
Typedefs
+
+
+using SizeType32 = std :: int32_t
+
+
+
+
+using SizeType64 = std :: int64_t
+
+
+
+
+using TokenIdType = std :: int32_t
+
+
+
+
+using LoraTaskIdType = std :: uint64_t
+
+
+
+
+
+
+
+
+
+
+
+
+using VecUniqueTokens = std :: vector < UniqueToken >
+
+
+
+
+template < typename T > using StringPtrMap = std :: unordered_map < std :: string , std :: shared_ptr < T > >
+
+
+
+
+
Enums
+
+
+enum class RequestType : std :: int32_t
+Values:
+
+
+enumerator kCONTEXT
+
+
+
+
+enumerator kGENERATION
+
+
+
+
+
+
+
+struct UniqueToken
+
+
Public Functions
+
+
+inline bool operator == ( UniqueToken const & other ) const noexcept
+
+
+
+
+
+
+
+
+
+
loraCachePageManagerConfig.h
@@ -9981,6 +7855,1486 @@
+
+
+worldConfig.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class WorldConfig
+
+
Public Functions
+
+
+explicit WorldConfig (
+
+
+SizeType32 tensorParallelism = 1 ,
+SizeType32 pipelineParallelism = 1 ,
+SizeType32 contextParallelism = 1 ,
+SizeType32 rank = 0 ,
+SizeType32 gpusPerNode = kDefaultGpusPerNode ,
+std :: optional < std :: vector < SizeType32 > > const & deviceIds = std :: nullopt ,
+bool enableAttentionDP = false ,
+
+
+)
+
+
+
+
+inline SizeType32 constexpr getSize ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getTensorParallelism ( ) const noexcept
+
+
+
+
+inline bool constexpr isTensorParallel ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getPipelineParallelism ( ) const noexcept
+
+
+
+
+inline bool constexpr isPipelineParallel ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getContextParallelism ( ) const noexcept
+
+
+
+
+inline bool constexpr isContextParallel ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getGpusPerNode ( ) const noexcept
+
+
+
+
+inline SizeType32 getGpusPerGroup ( ) const noexcept
+
+
+
+
+inline SizeType32 getDevice ( ) const noexcept
+
+
+
+
+inline SizeType32 getDeviceOf ( SizeType32 rank ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getPipelineParallelRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getTensorParallelRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getContextParallelRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getLocalRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getNodeRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getNodeRankOf (
+
+
+SizeType32 rank ,
+
+
+) const noexcept
+
+
+
+
+inline bool constexpr isFirstPipelineParallelRank ( ) const noexcept
+
+
+
+
+inline bool constexpr isLastPipelineParallelRank ( ) const noexcept
+Is my rank the last rank in its pipeline?
+
+
+
+
+inline bool constexpr isFirstTensorParallelRank ( ) const noexcept
+
+
+
+
+inline bool constexpr isFirstContextParallelRank ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr getLastRank ( ) const noexcept
+
+
+
+
+inline bool constexpr enableAttentionDP ( ) const noexcept
+
+
+
+
+std :: vector < SizeType32 > getPipelineParallelGroup ( ) const
+
+
+
+
+std :: vector < SizeType32 > getTensorParallelGroup ( ) const
+
+
+
+
+std :: vector < SizeType32 > getContextParallelGroup ( ) const
+
+
+
+
+bool validMpiConfig ( ) const
+
+
+
+
+
Public Static Functions
+
+
+static WorldConfig mpi (
+
+
+SizeType32 gpusPerNode = kDefaultGpusPerNode ,
+std :: optional < SizeType32 > tensorParallelism = std :: nullopt ,
+std :: optional < SizeType32 > pipelineParallelism = std :: nullopt ,
+std :: optional < SizeType32 > contextParallelism = std :: nullopt ,
+std :: optional < std :: vector < SizeType32 > > const & deviceIds = std :: nullopt ,
+bool enableAttentionDP = false ,
+
+
+)
+
+
+
+
+
Public Static Attributes
+
+
+static SizeType32 constexpr kDefaultGpusPerNode = 1
+
+
+
+
+
+
+
+
+
+
+
+
+loraModule.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
Functions
+
+
+inline std :: ostream & operator << (
+
+
+std :: ostream & output ,
+LoraModule const & module ,
+
+
+)
+
+
+
+
+
+class LoraModule
+
+
Public Types
+
+
+enum class ModuleType : SizeType32
+Values:
+
+
+enumerator kINVALID
+
+
+
+
+enumerator kATTN_QKV
+
+
+
+
+enumerator kATTN_Q
+
+
+
+
+enumerator kATTN_K
+
+
+
+
+enumerator kATTN_V
+
+
+
+
+enumerator kATTN_DENSE
+
+
+
+
+enumerator kMLP_H_TO_4H
+
+
+
+
+enumerator kMLP_4H_TO_H
+
+
+
+
+enumerator kMLP_GATE
+
+
+
+
+enumerator kCROSS_ATTN_QKV
+
+
+
+
+enumerator kCROSS_ATTN_Q
+
+
+
+
+enumerator kCROSS_ATTN_K
+
+
+
+
+enumerator kCROSS_ATTN_V
+
+
+
+
+enumerator kCROSS_ATTN_DENSE
+
+
+
+
+enumerator kMOE_H_TO_4H
+
+
+
+
+enumerator kMOE_4H_TO_H
+
+
+
+
+enumerator kMOE_GATE
+
+
+
+
+enumerator kMOE_ROUTER
+
+
+
+
+enumerator kMLP_ROUTER
+
+
+
+
+enumerator kMLP_GATE_UP
+
+
+
+
+
+
+using TensorPtr = ITensor :: SharedPtr
+
+
+
+
+
Public Functions
+
+
+inline explicit constexpr LoraModule (
+
+
+ModuleType const & t ,
+SizeType32 inDim ,
+SizeType32 outDim ,
+bool inDimFirst ,
+bool outDimFirst ,
+SizeType32 inTpSplitDim ,
+SizeType32 outTpSplitDim ,
+
+
+) noexcept
+
+
+
+
+inline explicit constexpr LoraModule ( ) noexcept
+
+
+
+
+explicit constexpr LoraModule ( LoraModule const & o ) = default
+
+
+
+
+constexpr LoraModule & operator = ( LoraModule const & o ) = default
+
+
+
+
+inline SizeType32 constexpr flattenedInOutSize (
+
+
+SizeType32 adapterSize ,
+bool isDora ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr inSize (
+
+
+SizeType32 adapterSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr outSize (
+
+
+SizeType32 adapterSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localInSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localOutSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localScalesSize (
+
+
+SizeType32 tpSize ,
+bool isDora ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localInDim (
+
+
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localOutDim (
+
+
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localInAdapterSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localOutAdapterSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localInOutSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr localTotalSize (
+
+
+SizeType32 adapterSize ,
+SizeType32 tpSize ,
+bool isDora ,
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 constexpr value ( ) const noexcept
+
+
+
+
+inline std :: string_view constexpr name ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr inDim ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr outDim ( ) const noexcept
+
+
+
+
+inline bool constexpr inDimFirst ( ) const noexcept
+
+
+
+
+inline bool constexpr outDimFirst ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr inTpSplitDim ( ) const noexcept
+
+
+
+
+inline SizeType32 constexpr outTpSplitDim ( ) const noexcept
+
+
+
+
+
Public Static Functions
+
+
+static std :: vector < LoraModule > createLoraModules (
+
+
+std :: vector < std :: string > const & loraModuleNames ,
+SizeType32 hiddenSize ,
+SizeType32 mlpHiddenSize ,
+SizeType32 numAttentionHeads ,
+SizeType32 numKvAttentionHeads ,
+SizeType32 attentionHeadSize ,
+SizeType32 tpSize ,
+SizeType32 numExperts ,
+
+
+)
+
+
+
+
+static inline ModuleType constexpr toModuleType (
+
+
+std :: string_view const & name ,
+
+
+)
+
+
+
+
+static inline std :: string_view constexpr toModuleName (
+
+
+ModuleType t ,
+
+
+) noexcept
+
+
+
+
+static inline std :: string_view constexpr toModuleName ( SizeType32 id )
+
+
+
+
+
+
+
+
+
+
+
+
+speculativeDecodingMode.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class SpeculativeDecodingMode
+
+
Public Types
+
+
+using UnderlyingType = std :: uint8_t
+
+
+
+
+
Public Functions
+
+
+inline bool constexpr isNone ( ) const
+
+
+
+
+inline bool constexpr isDraftTokensExternal ( ) const
+
+
+
+
+inline bool constexpr isMedusa ( ) const
+
+
+
+
+inline bool constexpr isLookaheadDecoding ( ) const
+
+
+
+
+inline bool constexpr isExplicitDraftTokens ( ) const
+
+
+
+
+inline bool constexpr isEagle ( ) const
+
+
+
+
+inline bool constexpr updatesPositionIds ( ) const
+
+
+
+
+inline bool constexpr requiresAttentionMask ( ) const
+
+
+
+
+inline bool constexpr predictsDraftTokens ( ) const
+
+
+
+
+inline bool constexpr needsKVCacheRewind ( ) const
+
+
+
+
+inline bool constexpr variableDraftLength ( ) const
+
+
+
+
+inline bool constexpr hasDraftLogits ( ) const
+
+
+
+
+inline bool constexpr needsDecoderPrologue ( ) const
+
+
+
+
+inline bool operator == ( SpeculativeDecodingMode const & other ) const
+
+
+
+
+inline explicit constexpr SpeculativeDecodingMode (
+
+
+UnderlyingType state ,
+
+
+)
+
+
+
+
+
Public Static Functions
+
+
+static inline auto constexpr None ( )
+
+
+
+
+static inline auto constexpr DraftTokensExternal ( )
+
+
+
+
+static inline auto constexpr Medusa ( )
+
+
+
+
+static inline auto constexpr LookaheadDecoding ( )
+
+
+
+
+static inline auto constexpr ExplicitDraftTokens ( )
+
+
+
+
+static inline auto constexpr Eagle ( )
+
+
+
+
+
Private Functions
+
+
+inline bool constexpr anyBitSet ( UnderlyingType bits ) const
+
+
+
+
+inline bool constexpr allBitSet ( UnderlyingType bits ) const
+
+
+
+
+
+
Private Static Attributes
+
+
+static UnderlyingType constexpr kNone = { 1U << 0U }
+
+
+
+
+static UnderlyingType constexpr kDraftTokensExternal = { 1U << 1U }
+
+
+
+
+static UnderlyingType constexpr kMedusa = { 1U << 2U }
+
+
+
+
+static UnderlyingType constexpr kLookaheadDecoding = { 1U << 3U }
+
+
+
+
+static UnderlyingType constexpr kExplicitDraftTokens = { 1U << 4U }
+
+
+
+
+static UnderlyingType constexpr kEagle = { 1U << 5U }
+
+
+
+
+
+
+
+
+
+
+
+cudaEvent.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class CudaEvent
+
+
Public Types
+
+
+using pointer = cudaEvent_t
+
+
+
+
+
Public Functions
+
+
+inline explicit CudaEvent ( unsigned int flags = cudaEventDisableTiming )
+Creates a new cuda event. The event will be destroyed in the destructor.
+
+Parameters:
+flags – Flags for event creation. By default, event timing is disabled.
+
+
+
+
+
+
+inline explicit CudaEvent ( pointer event , bool ownsEvent = true )
+Pass an existing cuda event to this object.
+
+Parameters:
+
+
+
+
+
+
+
+inline pointer get ( ) const
+Returns the event associated with this object.
+
+
+
+
+inline void synchronize ( ) const
+Synchronizes the event.
+
+
+
+
+
Private Types
+
+
+using element_type = std :: remove_pointer_t < pointer >
+
+
+
+
+using EventPtr = std :: unique_ptr < element_type , Deleter >
+
+
+
+
+
Private Members
+
+
+EventPtr mEvent
+
+
+
+
+
+class Deleter
+
+
Public Functions
+
+
+inline explicit Deleter ( bool ownsEvent )
+
+
+
+
+inline explicit Deleter ( )
+
+
+
+
+inline constexpr void operator () ( pointer event ) const
+
+
+
+
+
Private Members
+
+
+bool mOwnsEvent
+
+
+
+
+
+
+
+
+
+
+
+
+
speculativeDecodingModule.h
@@ -10143,1635 +9497,2282 @@ one more than decoding draft tokens for prediction from primary head
-
-lookaheadBuffers.h
+
+iGptDecoderBatched.h
namespace tensorrt_llm
+namespace batch_manager
+
+
+
+
namespace runtime
-
-class LookaheadDecodingBuffers
-
+
+class IGptDecoderBatched
+
+#include <iGptDecoderBatched.h>
+GPT decoder class with support for in-flight batching.
+Subclassed by tensorrt_llm::runtime::GptDecoderBatched
+
-
-
-
-
-
-class LookaheadRuntimeBuffers
-
-
-
Public Functions
-
-
-LookaheadRuntimeBuffers (
+
+virtual void setup (
+executor :: DecodingMode const & mode ,
SizeType32 maxBatchSize ,
SizeType32 maxBeamWidth ,
-BufferManager const & manager ,
-ModelConfig const & modelConfig ,
-WorldConfig const & worldConfig ,
-executor :: DecodingConfig const & decodingConfig ,
-TllmRuntime const & runtime ,
-
-
-)
-
-
-
-
-void setFromInputs (
-
-
-SizeType32 numCtxSequences ,
-SizeType32 numGenSequences ,
-ITensor const & requestTypes ,
-ITensor const & seqSlots ,
-LookaheadDecodingBuffers const & decoderLookaheadBuffers ,
-TllmRuntime const & runtime ,
+nvinfer1 :: DataType dtype ,
ModelConfig const & modelConfig ,
WorldConfig const & worldConfig ,
-) const
-
+
) = 0
+
Setup the decoder before calling forward()
+
-
-void reshape (
+
-SizeType32 numCtxSequences ,
-SizeType32 numGenSequences ,
-SizeType32 tokensPerStep ,
+RequestVector const & genRequests ,
+TensorPtr const & batchSlots ,
-)
-
+
) = 0
+
Disable Lookahead decoding.
+
-
-void insertInputTensors (
+
+virtual CudaEvent forwardAsync (
-TensorMap & inputBuffers ,
-TensorMap & outputBuffers ,
-WorldConfig const & worldConfig ,
+decoder :: DecoderState const & decoderState ,
+decoder_batch :: Input const & input ,
-) const
-
+
) = 0
+
Run one step for all requests without blocking the host process and return the token for synchronization.
+
-
-void enableLookaheadDecoding (
+
+virtual void forward (
-SizeType32 maxBatchSize ,
-SizeType32 tokensPerStep ,
+decoder :: DecoderState const & decoderState ,
+decoder_batch :: Input const & input ,
-)
-
+
) = 0
+
Run one step for all requests and wait for completion on the host.
+
-
-void disableLookaheadDecoding ( )
-
-
-
-
-
-
-
-
-
-
-
-
-promptTuningParams.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-template < typename TTensor > class GenericPromptTuningParams
-
-
-
-
Public Members
-
-
-TensorPtr embeddingTable
-
-
-
-
-TensorPtr tasks
-
-
-
-
-TensorPtr vocabSize
-
-
-
-
-std :: vector < bool > promptTuningEnabled
-
-
-
-
-
-
-
-class PromptTuningParams : public tensorrt_llm :: runtime :: GenericPromptTuningParams < ITensor :: SharedPtr >
-
-
-
Public Functions
-
-
-inline explicit PromptTuningParams (
-
-
-TensorPtr embeddingTable = nullptr ,
-TensorPtr tasks = nullptr ,
-TensorPtr vocabSize = nullptr ,
-
-
-)
-
-
-
-
-void fillTasksTensor (
-
-
-TensorPtr tasksHost ,
-SizeType32 batchSize ,
-SizeType32 numContextRequests ,
-std :: vector < SizeType32 > const & reqBeamWidths ,
-std :: vector < SizeType32 > const & reqPromptLengths ,
-BufferManager const & manager ,
-bool packedInput ,
-
-
-)
-
-
-
-
-
-
-
-
-
-
-
-medusaModule.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class MedusaModule : public tensorrt_llm :: runtime :: SpeculativeDecodingModule
-
-
-
Public Functions
-
-
-inline explicit MedusaModule (
-
-
-SizeType32 maxAcceptedTokens ,
-SizeType32 maxDraftTokens ,
-
-
-) noexcept
-
-
-
-
-inline explicit MedusaModule ( ) noexcept
-
-
-
-
-inline MedusaChoices const & getMedusaChoices ( ) const noexcept
-
-
-
-
-
Private Members
-
-
-MedusaChoices mDefaultMedusaChoices = { { 0 } , { 0 , 0 } , { 1 } , { 0 , 1 } , { 2 } , { 0 , 0 , 0 } , { 1 , 0 } , { 0 , 2 } , { 3 } , { 0 , 3 } , { 4 } , { 0 , 4 } , { 2 , 0 } , { 0 , 5 } , { 0 , 0 , 1 } , { 5 } , { 0 , 6 } , { 6 } , { 0 , 7 } , { 0 , 1 , 0 } , { 1 , 1 } , { 7 } , { 0 , 8 } , { 0 , 0 , 2 } , { 3 , 0 } , { 0 , 9 } , { 8 } , { 9 } , { 1 , 0 , 0 } , { 0 , 2 , 0 } , { 1 , 2 } , { 0 , 0 , 3 } , { 4 , 0 } , { 2 , 1 } , { 0 , 0 , 4 } , { 0 , 0 , 5 } , { 0 , 0 , 0 , 0 } , { 0 , 1 , 1 } , { 0 , 0 , 6 } , { 0 , 3 , 0 } , { 5 , 0 } , { 1 , 3 } , { 0 , 0 , 7 } , { 0 , 0 , 8 } , { 0 , 0 , 9 } , { 6 , 0 } , { 0 , 4 , 0 } , { 1 , 4 } , { 7 , 0 } , { 0 , 1 , 2 } , { 2 , 0 , 0 } , { 3 , 1 } , { 2 , 2 } , { 8 , 0 } , { 0 , 5 , 0 } , { 1 , 5 } , { 1 , 0 , 1 } , { 0 , 2 , 1 } , { 9 , 0 } , { 0 , 6 , 0 } , { 0 , 0 , 0 , 1 } , { 1 , 6 } , { 0 , 7 , 0 } }
-
-
-
-
-
-
-
-
-
-
-
-iBuffer.h
-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
Typedefs
-
-
-template < typename T > using PointerElementType = typename std :: remove_reference_t < T > :: element_type
-
-
-
-
-
Enums
-
-
-enum class MemoryType : std :: int32_t
-Values:
-
-
-enumerator kGPU
-
-
-
-
-enumerator kCPU
-
-
-
-
-enumerator kPINNED
-
-
-
-
-enumerator kUVM
-
-
-
-
-enumerator kPINNEDPOOL
-
-
-
-
-
-
-
Functions
-
-
-template < typename T > std :: shared_ptr < std :: remove_const_t < T > > constPointerCast (
-
-
-std :: shared_ptr < T > const & ptr ,
-
-
-) noexcept
-
-
-
-
-template < typename T , typename D > std :: shared_ptr < std :: remove_const_t < T > > constPointerCast (
-
-
-std :: unique_ptr < T , D > & & ptr ,
-
-
-) noexcept
-
-
-
-
-template < typename T > T const * bufferCast ( IBuffer const & buffer )
-Gets a typed pointer to the constant underlying data of the buffer.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-buffer – The buffer to get a pointer to.
-
-Returns:
-A pointer to constant T .
-
-
-
-
-
-
-template < typename T > T * bufferCast ( IBuffer & buffer )
-Gets a typed pointer to the underlying data of the buffer.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-buffer – The buffer to get a pointer to.
-
-Returns:
-A pointer to T .
-
-
-
-
-
-
-template < typename T > T * bufferCastOrNull (
-
-
-IBuffer :: SharedPtr const & bufferPtr ,
-
-
-)
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-bufferPtr – A possibly null shared ptr.
-
-Returns:
-A pointer to T, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T const * bufferCastOrNull (
-
-
-IBuffer :: SharedConstPtr const & bufferPtr ,
-
-
-)
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-bufferPtr – A possibly null shared ptr.
-
-Returns:
-A pointer to const T, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T * bufferCastOrNull (
-
-
-std :: optional < IBuffer :: SharedPtr > const & optionalBufferPtr ,
-
-
-)
-Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-optionalBufferPtr – A possibly empty optional.
-
-Returns:
-A pointer to T, possibly nullptr.
-
-
-
-
-
-
-template < typename T > T const * bufferCastOrNull (
-
-
-std :: optional < IBuffer :: SharedConstPtr > const & optionalBufferPtr ,
-
-
-)
-Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.
-
-Template Parameters:
-T – The type of the underlying data.
-
-Parameters:
-optionalBufferPtr – A possibly empty optional.
-
-Returns:
-A pointer to const T, possibly nullptr.
-
-
-
-
-
-
-std :: ostream & operator << (
-
-
-std :: ostream & output ,
-IBuffer const & buffer ,
-
-
-)
-Utility function to print a buffer.
-
-
-
-
-
-class BufferDataType
-
-#include <iBuffer.h>
-A wrapper around nvinfer1::DataType that provides a support for pointer types.
-
-
Public Functions
-
-
-inline constexpr BufferDataType (
-
-
-nvinfer1 :: DataType dataType ,
-bool _unsigned = false ,
-bool pointer = false ,
-
-
-)
-
-
-
-
-inline constexpr operator nvinfer1 :: DataType ( ) const noexcept
-
-
-
-
-inline constexpr nvinfer1 :: DataType getDataType ( ) const noexcept
-
-
-
-
-inline constexpr bool isPointer ( ) const noexcept
-
-
-
-
-inline constexpr bool isUnsigned ( ) const
-
-
-
-
-inline constexpr std :: size_t getSize ( ) const noexcept
-
-
-
-
-inline constexpr std :: size_t getSizeInBits ( ) const noexcept
-
-
-
-
-
Public Static Attributes
-
-
-static auto constexpr kTrtPointerType = nvinfer1 :: DataType :: kINT64
-
-
-
-
-
Private Members
-
-
-nvinfer1 :: DataType mDataType
-
-
-
-
-bool mUnsigned
-
-
-
-
-bool mPointer
-
-
-
-
-
-
-
-template < typename T > class BufferRange : public tensorrt_llm :: common :: ArrayView < T >
-
-
Public Types
-
-
-using Base = tensorrt_llm :: common :: ArrayView < T >
-
-
-
-
-
Public Functions
-
-
-inline BufferRange ( T * data , size_type size )
-
-
-
-
-template < typename U = T , std :: enable_if_t < ! std :: is_const_v < U > , bool > = true > inline explicit BufferRange (
-
-
-IBuffer & buffer ,
-
-
-)
-
-
-
-
-template < typename U = T , std :: enable_if_t < std :: is_const_v < U > , bool > = true > inline explicit BufferRange (
-
-
-IBuffer const & buffer ,
-
-
-)
-
-
-
-
-
-
-
-template < nvinfer1 :: DataType kDataType , bool kIsUnsigned = false , bool kIsPointer = false > struct DataTypeTraits
-
-#include <iBuffer.h>
-For converting a TensorRT data type to a C++ data type.
-
-
-
-
-template < nvinfer1 :: DataType kDataType , bool kUnsigned > struct DataTypeTraits < kDataType , kUnsigned , true >
-
-
Public Types
-
-
-using type = typename DataTypeTraits < kDataType , kUnsigned , false > :: type *
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "*"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < bool kUnsigned > struct DataTypeTraits < nvinfer1 :: DataType :: kBOOL , kUnsigned >
-
-
Public Types
-
-
-using type = bool
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "bool"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kFLOAT >
-
-
Public Types
-
-
-using type = float
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "float"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kHALF >
-
-
Public Types
-
-
-using type = half
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "half"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT32 >
-
-
Public Types
-
-
-using type = std :: int32_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "int32"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT32 , true >
-
-
Public Types
-
-
-using type = std :: uint32_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "uint32"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT64 >
-
-
Public Types
-
-
-using type = std :: int64_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "int64"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT64 , true >
-
-
Public Types
-
-
-using type = std :: uint64_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "uint64"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < > struct DataTypeTraits < nvinfer1 :: DataType :: kINT8 >
-
-
Public Types
-
-
-using type = std :: int8_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "int8"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-template < bool kUnsigned > struct DataTypeTraits < nvinfer1 :: DataType :: kUINT8 , kUnsigned >
-
-
Public Types
-
-
-using type = std :: uint8_t
-
-
-
-
-
Public Static Attributes
-
-
-static char constexpr name [ ] = "uint8"
-
-
-
-
-static auto constexpr size = sizeof ( type )
-
-
-
-
-
-
-
-class IBuffer
-Subclassed by tensorrt_llm::runtime::ITensor
-
-
Public Types
-
-
-using UniquePtr = std :: unique_ptr < IBuffer >
-
-
-
-
-using SharedPtr = std :: shared_ptr < IBuffer >
-
-
-
-
-using UniqueConstPtr = std :: unique_ptr < IBuffer const >
-
-
-
-
-using SharedConstPtr = std :: shared_ptr < IBuffer const >
-
-
-
-
-using DataType = nvinfer1 :: DataType
-
-
-
-
-
Public Functions
-
-
-virtual void * data ( ) = 0
-Returns a pointer to underlying array.
-
-
-
-
-virtual void const * data ( ) const = 0
-Returns a pointer to underlying array.
-
-
-
-
-inline virtual void * data ( std :: size_t index )
-Returns a pointer to the underlying array at a given element index.
-
-
-
-
-inline virtual void const * data ( std :: size_t index ) const
-Returns a pointer to the underlying array at a given element index.
-
-
-
-
-virtual std :: size_t getSize ( ) const = 0
-Returns the size (in number of elements) of the buffer.
-
-
-
-
-inline virtual std :: size_t getSizeInBytes ( ) const
-Returns the size (in bytes) of the buffer.
-
-
-
-
-virtual std :: size_t getCapacity ( ) const = 0
-Returns the capacity of the buffer.
-
-
-
-
-virtual DataType getDataType ( ) const = 0
-Returns the data type of the buffer.
-
-
-
-
-virtual char const * getDataTypeName ( ) const
-
-
-
-
-virtual MemoryType getMemoryType ( ) const = 0
-Returns the memory type of the buffer.
-
-
-
-
-virtual char const * getMemoryTypeName ( ) const
-
-
-
-
-virtual void resize ( std :: size_t newSize ) = 0
-Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
-
-
-
-virtual void release ( ) = 0
-Releases the buffer. It will be reset to nullptr.
-
-
-
-
-virtual ~IBuffer ( ) = default
-
-
-
-
-IBuffer ( IBuffer const & ) = delete
-Not allowed to copy.
-
-
-
-
-IBuffer & operator = ( IBuffer const & ) = delete
-Not allowed to copy.
-
-
-
-
-
Public Static Functions
-
-
-static char const * getDataTypeName ( DataType dataType )
-
-
-
-
-static UniquePtr slice (
-
-
-SharedPtr buffer ,
-std :: size_t offset ,
-std :: size_t size ,
-
-
-)
-Creates a sliced view on the underlying buffer . The view will have the same data type as buffer .
-
-Parameters:
-
-buffer – The buffer to view.
-offset – The offset of the view.
-size – The size of the view.
-
-
-Returns:
-A view on the buffer .
-
-
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-
-
-TConstPtr & & tensor ,
-std :: size_t offset ,
-std :: size_t size ,
-
-
-)
-
-
-
-
-static inline UniquePtr slice ( SharedPtr buffer , std :: size_t offset )
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr slice (
-
-
-TConstPtr & & tensor ,
-std :: size_t offset ,
-
-
-)
-
-
-
-
-static inline UniquePtr view ( SharedPtr tensor )
-Returns a view on the underlying tensor which can be independently resized.
-
-Parameters:
-tensor – The tensor to view.
-
-Returns:
-A view on the tensor .
-
-
-
-
-
-
-static inline UniquePtr view ( SharedPtr tensor , std :: size_t size )
-Returns a view on the underlying tensor with a different size.
-
-Parameters:
-
-
-Returns:
-A view on the tensor .
-
-
-
-
-
-
-template < typename TConstPtr , std :: enable_if_t < std :: is_const_v < PointerElementType < TConstPtr > > , int > = 0 > static inline UniqueConstPtr view (
-
-
-TConstPtr & & tensor ,
-std :: size_t size ,
-
-
-)
-
-
-
-
-static UniquePtr wrap (
-
-
-void * data ,
-DataType type ,
-std :: size_t size ,
-std :: size_t capacity ,
-
-
-)
-Wraps the given data in an IBuffer . The IBuffer will not own the underlying data and cannot be resized beyond capacity .
-
-Parameters:
-
-data – The data to wrap.
-type – The data type of the data .
-size – The size of the buffer.
-capacity – The capacity of the buffer.
-
-
-Returns:
-An IBuffer .
-
-
-
-
-
-
-static inline UniquePtr wrap (
-
-
-void * data ,
-DataType type ,
-std :: size_t size ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-T * data ,
-std :: size_t size ,
-std :: size_t capacity ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-T * data ,
-std :: size_t size ,
-
-
-)
-
-
-
-
-template < typename T > static inline UniquePtr wrap (
-
-
-std :: vector < T > & v ,
-
-
-)
-
-
-
-
-static MemoryType memoryType ( void const * data )
-Determine the memory type of a pointer.
+) const = 0
+Gather final beam search results for request batchIdx . Result will only be available after event returned.
Protected Functions
-
-IBuffer ( ) = default
+
+IGptDecoderBatched ( ) = default
-
-inline std :: size_t toBytes ( std :: size_t size ) const
-Returns an array index or size in bytes.
-
-
-
-
-
-
-
-template < MemoryType T > struct MemoryTypeString
-
-
-
-
-template < > struct MemoryTypeString < MemoryType :: kCPU >
-
-
Public Static Attributes
-
-
-static auto constexpr value = "CPU"
+
+virtual ~IGptDecoderBatched ( ) = default
-
-
-template < > struct MemoryTypeString < MemoryType :: kGPU >
-
-
Public Static Attributes
-
-
-static auto constexpr value = "GPU"
+
+
+namespace decoder
-
-
-
-
-
-template < > struct MemoryTypeString < MemoryType :: kPINNED >
+
+
+namespace decoder_batch
+
+
+class Input
-
Public Static Attributes
-
-
-static auto constexpr value = "PINNED"
+Public Types
+
+
+using TensorConstPtr = ITensor :: SharedConstPtr
-
-
-
-
-
-template < > struct MemoryTypeString < MemoryType :: kPINNEDPOOL >
-
-
Public Static Attributes
-
-
-static auto constexpr value = "PINNEDPOOL"
-
-
-
-
-
-
-
-template < > struct MemoryTypeString < MemoryType :: kUVM >
-
-
Public Static Attributes
-
-
-static auto constexpr value = "UVM"
-
-
-
-
-
-
-
-template < typename T , bool = false > struct TRTDataType
-
-#include <iBuffer.h>
-For converting a C++ data type to a TensorRT data type.
-
-
-
-
-template < > struct TRTDataType < bool >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kBOOL
-
-
-
-
-
-
-
-template < > struct TRTDataType < float >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kFLOAT
-
-
-
-
-
-
-
-template < > struct TRTDataType < half >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kHALF
-
-
-
-
-
-
-
-template < > struct TRTDataType < kernels :: FinishedState >
-
-
Public Static Attributes
-
-
-static constexpr auto value = TRTDataType < kernels :: FinishedState :: UnderlyingType > :: value
-
-
-
-
-
-
-
-template < > struct TRTDataType < kernels :: KVCacheIndex >
-
-
Public Static Attributes
-
-
-static constexpr auto value = TRTDataType < kernels :: KVCacheIndex :: UnderlyingType > :: value
-
-
-
-
-
-
-
-template < > struct TRTDataType < runtime :: RequestType >
-
-
Public Static Attributes
-
-
-static constexpr auto value = TRTDataType < std :: underlying_type_t < runtime :: RequestType > > :: value
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: int32_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kINT32
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: int64_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kINT64
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: int8_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kINT8
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: uint32_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = BufferDataType { nvinfer1 :: DataType :: kINT32 , true }
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: uint64_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = BufferDataType { nvinfer1 :: DataType :: kINT64 , true }
-
-
-
-
-
-
-
-template < > struct TRTDataType < std :: uint8_t >
-
-
Public Static Attributes
-
-
-static constexpr auto value = nvinfer1 :: DataType :: kUINT8
-
-
-
-
-
-
-
-template < typename T > struct TRTDataType < T * >
-
-
Private Static Attributes
+
Public Functions
+
+
+inline explicit Input (
+
+
+std :: vector < std :: vector < TensorConstPtr > > const & logits ,
+SizeType32 maxDecoderSteps ,
+
+
+)
+
+
+
+
+inline explicit Input ( std :: vector < TensorConstPtr > const & logits )
+
+
+
+
+
Public Members
-
-static auto constexpr kUnderlyingType = BufferDataType { TRTDataType < std :: remove_const_t < T > , false > :: value }
+
+std :: vector < std :: vector < TensorConstPtr > > logits
+[maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+Mandatory parameters Logits
+
+
+
+
+SizeType32 maxDecoderSteps
+Maximum number of decoding tokens of active slots.
+
+
+
+
+std :: vector < TensorPtr > batchSlots
+Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize].
+
+
+
+
+
+
+
+
+
+
+
+
+
+eagleModule.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class EagleModule : public tensorrt_llm :: runtime :: SpeculativeDecodingModule
+
+
Public Functions
+
+
+inline explicit EagleModule (
+
+
+SizeType32 maxDraftPathLen ,
+SizeType32 maxDecodingDraftTokens ,
+SizeType32 numTransformersLayer ,
+SizeType32 maxNonLeafNodesPerLayer ,
+
+
+) noexcept
+
+
+
+
+inline explicit EagleModule ( ) noexcept
+
+
+
+
+inline executor :: EagleChoices const & getDefaultEagleChoices (
+
+
+
+
+) const noexcept
+
+
+
+
+inline SizeType32 getNumTransformerLayers ( ) const noexcept
+
+
+
+
+inline SizeType32 getMaxNonLeafNodesPerLayer ( ) const noexcept
+
+
+
+
+
Private Members
+
+
+SizeType32 mNumTransformersLayer
+
+
+
+
+SizeType32 mMaxNonLeafNodesPerLayer
+
+
+
+
+executor :: EagleChoices mDefaultEagleChoices = { { 0 } , { 0 , 0 } , { 1 } , { 0 , 1 } , { 2 } , { 0 , 0 , 0 } , { 1 , 0 } , { 0 , 2 } , { 3 } , { 0 , 3 } , { 4 } , { 0 , 4 } , { 2 , 0 } , { 0 , 5 } , { 0 , 0 , 1 } , { 5 } , { 0 , 6 } , { 6 } , { 0 , 7 } , { 0 , 1 , 0 } , { 1 , 1 } , { 7 } , { 0 , 8 } , { 0 , 0 , 2 } , { 3 , 0 } , { 0 , 9 } , { 8 } , { 9 } , { 1 , 0 , 0 } , { 0 , 2 , 0 } , { 1 , 2 } , { 0 , 0 , 3 } , { 4 , 0 } , { 2 , 1 } , { 0 , 0 , 4 } , { 0 , 0 , 5 } , { 0 , 0 , 0 , 0 } , { 0 , 1 , 1 } , { 0 , 0 , 6 } , { 0 , 3 , 0 } , { 5 , 0 } , { 1 , 3 } , { 0 , 0 , 7 } , { 0 , 0 , 8 } , { 0 , 0 , 9 } , { 6 , 0 } , { 0 , 4 , 0 } , { 1 , 4 } , { 7 , 0 } , { 0 , 1 , 2 } , { 2 , 0 , 0 } , { 3 , 1 } , { 2 , 2 } , { 8 , 0 } , { 0 , 5 , 0 } , { 1 , 5 } , { 1 , 0 , 1 } , { 0 , 2 , 1 } , { 9 , 0 } , { 0 , 6 , 0 } , { 0 , 0 , 0 , 1 } , { 1 , 6 } , { 0 , 7 , 0 } }
+
+
+
+
+
+
+
+
+
+
+
+tllmLogger.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class TllmLogger : public nvinfer1 :: ILogger
+
+
Public Functions
+
+
+void log (
+
+
+Severity severity ,
+nvinfer1 :: AsciiChar const * msg ,
+
+
+) noexcept override
+
+
+
+
+Severity getLevel ( )
+
+
+
+
+void setLevel ( Severity level )
+
+
+
+
+
+
+
+
+
+
+
+gptDecoderBatched.h
+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+
+
+
+namespace runtime
+
+
+class GptDecoderBatched : public tensorrt_llm :: runtime :: IGptDecoderBatched
+
+#include <gptDecoderBatched.h>
+GPT decoder class with support for in-flight batching.
+
+
+
Public Functions
+
+
+explicit GptDecoderBatched ( CudaStreamPtr stream )
+
+
+
+
+virtual void setup (
+
+
+executor :: DecodingMode const & mode ,
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+nvinfer1 :: DataType dtype ,
+ModelConfig const & modelConfig ,
+WorldConfig const & worldConfig ,
+
+
+) override
+Setup the decoder before calling forward()
+
+
+
+
+RequestVector const & genRequests ,
+TensorPtr const & batchSlots ,
+
+
+) override
+Disable Lookahead decoding.
+
+
+
+
+virtual CudaEvent forwardAsync (
+
+
+decoder :: DecoderState const & decoderState ,
+decoder_batch :: Input const & input ,
+
+
+) override
+Run one step for all requests without blocking the host process and return the token for synchronization.
+
+
+
+
+virtual void forward (
+
+
+decoder :: DecoderState const & decoderState ,
+decoder_batch :: Input const & input ,
+
+
+) override
+Run one step for all requests and wait for completion on the host.
+
+
+
+
+virtual CudaEvent finalize (
+
+
+decoder :: DecoderState const & decoderState ,
+SizeType32 batchSlot ,
+SamplingConfig const & samplingConfig ,
+bool streaming ,
+
+
+) const override
+Gather final beam search results for request batchSlot . Result will only be available after event returned.
+
+
+
+
+inline CudaStreamPtr getDecoderStream ( ) const
+
+
+
+
+inline IGptDecoder & getUnderlyingDecoder ( ) const
+
+
+
+
+inline BufferManager const & getBufferManager ( ) const
+
+
+
+
+
Private Types
+
+
+using GptDecoderPtr = std :: unique_ptr < IGptDecoder >
+
+
+
+
+
Private Functions
+
+
+void forwardDispatch (
+
+
+decoder :: DecoderState const & decoderState ,
+decoder_batch :: Input const & input ,
+
+
+)
+Calls decoders for tokens per engine step.
+
+
+
+
+
+
+
+
+
+
+
+
+cudaStream.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class CudaStream
+
+
Public Functions
+
+
+inline explicit CudaStream (
+
+
+unsigned int flags = cudaStreamNonBlocking ,
+int priority = 0 ,
+
+
+)
+Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
+
+Parameters:
+
+flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.
+priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
+
+
+
+
+
+
+
+inline explicit CudaStream (
+
+
+cudaStream_t stream ,
+int device ,
+bool ownsStream = true ,
+
+
+)
+Pass an existing cuda stream to this object.
+
+Parameters:
+
+stream – The stream to pass to this object.
+device – The device on which the stream was created.
+ownsStream – Whether this object owns the stream and destroys it in the destructor.
+
+
+
+
+
+
+
+inline explicit CudaStream ( cudaStream_t stream )
+Construct with an existing cuda stream or the default stream by passing nullptr.
+
+
+
+
+inline int getDevice ( ) const
+Returns the device on which the stream was created.
+
+
+
+
+inline cudaStream_t get ( ) const
+Returns the stream associated with this object.
+
+
+
+
+inline void synchronize ( ) const
+Synchronizes the stream.
+
+
+
+
+inline void record ( CudaEvent :: pointer event ) const
+Record an event on the stream.
+
+
+
+
+inline void record ( CudaEvent const & event ) const
+Record an event on the stream.
+
+
+
+
+inline void wait ( CudaEvent :: pointer event ) const
+Wait for an event.
+
+
+
+
+inline void wait ( CudaEvent const & event ) const
+Wait for an event.
+
+
+
+
+
Private Types
+
+
+using StreamPtr = std :: unique_ptr < std :: remove_pointer_t < cudaStream_t > , Deleter >
+
+
+
+
+
Private Members
+
+
+StreamPtr mStream
+
+
+
+
+int mDevice = { - 1 }
+
+
+
+
+
+class Deleter
+
+
Public Functions
+
+
+inline explicit Deleter ( bool ownsStream )
+
+
+
+
+inline explicit Deleter ( )
+
+
+
+
+inline constexpr void operator () ( cudaStream_t stream ) const
+
+
+
+
+
Private Members
+
+
+bool mOwnsStream
+
+
+
+
+
+
+
+
+
+
+
+
+
+ipcNvlsMemory.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
Functions
+
+
+void MPI_group_barrier ( std :: set < int > ranks )
+
+
+
+
+bool ipcNvlsSupported ( )
+
+
+
+
+IpcNvlsHandle * ipcNvlsAllocate ( size_t size , std :: set < int > ranks )
+
+
+
+
+void ipcNvlsFree ( IpcNvlsHandle * handle )
+
+
+
+
+
+template < typename T > class DeviceAllocationNvls
+
+
Public Functions
+
+
+DeviceAllocationNvls ( ) = default
+
+
+
+
+inline ~DeviceAllocationNvls ( )
+
+
+
+
+inline void reset ( size_t size , std :: set < int > ranks )
+
+
+
+
+inline T * getMulticastPointer ( ) const
+
+
+
+
+inline T * getUnicastPointer ( ) const
+
+
+
+
+inline T * * getIpcUnicastPointers ( )
+
+
+
+
+inline size_t getCapacity ( ) const
+
+
+
+
+inline void free ( )
+
+
+
+
+
Private Members
+
+
+size_t _capacity = 0
+
+
+
+
+IpcNvlsHandle * _handle
-
-template < > struct TRTDataType < void * >
+
+struct IpcNvlsHandle
+
Public Members
+
+
+size_t size = 0
+
+
+
+
+uintptr_t uc_ptr = 0
+
+
+
+
+uintptr_t mc_ptr = 0
+
+
+
+
+std :: vector < uintptr_t > ipc_uc_ptrs
+
+
+
+
+CUdeviceptr uc_va
+
+
+
+
+CUdeviceptr mc_va
+
+
+
+
+std :: vector < CUdeviceptr > ipc_uc_vas
+
+
+
+
+CUmemGenericAllocationHandle uc_handle
+
+
+
+
+CUmemGenericAllocationHandle mc_handle
+
+
+
+
+std :: vector < CUmemGenericAllocationHandle > ipc_uc_handles
+
+
+
+
+
+
+
+
+
+
+
+samplingConfig.h
+
+
Defines
+
+
+SET_FROM_OPTIONAL ( varName , VarName , VarType )
+
+
+
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class SamplingConfig
+
+
Public Functions
+
+
+inline explicit SamplingConfig ( SizeType32 beamWidth = 1 )
+
+
+
+
+inline explicit SamplingConfig (
+
+
+std :: vector < SamplingConfig > const & configs ,
+
+
+)
+
+
+
+
+inline explicit SamplingConfig (
+
+
+executor :: SamplingConfig const & samplingConfig ,
+std :: optional < executor :: ExternalDraftTokensConfig > const & externalDraftTokensConfig = std :: nullopt ,
+
+
+)
+
+
+
+
+inline bool validate ( )
+
+
+
+
+template < typename T > inline bool useDefaultValues (
+
+
+OptVec < T > const & vec ,
+T defaultValue ,
+
+
+)
+
+
+
+
+inline bool operator == ( SamplingConfig const & other ) const
+
+
+
+
+inline SizeType32 getNumReturnBeams ( ) const
+
+
+
+
+inline SizeType32 getMaxBeamWidth ( ) const noexcept
+
+
+
+
+
+
Private Types
+
+
+using FloatType = float
+
+
+
+
+template < typename T > using OptVec = std :: optional < std :: vector < T > >
+
+
+
+
+
Private Functions
+
+
+template < typename T > inline bool validateVec (
+
+
+std :: string name ,
+OptVec < T > const & vec ,
+T min ,
+std :: optional < T > max = std :: nullopt ,
+
+
+)
+
+
+
+
+
Private Static Functions
+
+
+template < typename T > static inline OptVec < T > fuseValues (
+
+
+std :: vector < SamplingConfig > const & configs ,
+std :: function < OptVec < T > ( size_t ci ) > accessor ,
+T defaultValue ,
+
+
+)
+
+
+
+
+
+
+
+
+
+
+
+request.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+namespace decoder_batch
+
+
+class Request
+
+
+
+
+
+
+
+
+
+
+
+
+
+decoderState.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+namespace decoder
+
+
+class BeamSearchBuffers
+
+
Public Functions
+
+
+explicit BeamSearchBuffers ( BufferManager const & bufferManager )
+
+
+
+
+void reshape ( SizeType32 maxBeamWidth , SizeType32 maxSequenceLength )
+
+
+
+
+
+
+
+
+class DecoderState
+
+
+
Public Functions
+
+
+DecoderState ( )
+
+
+
+
+void setup (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+SizeType32 maxAttentionWindow ,
+SizeType32 sinkTokenLength ,
+SizeType32 maxSequenceLength ,
+nvinfer1 :: DataType dtype ,
+ModelConfig const & modelConfig ,
+WorldConfig const & worldConfig ,
+BufferManager const & bufferManager ,
+
+
+)
+Setup buffers for the decoder excluding speculative decoding.
+
+
+
+
+void setupCacheIndirection (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+SizeType32 maxAttentionWindow ,
+BufferManager const & bufferManager ,
+
+
+)
+Setup buffers for the cache indirection.
+This is used for beam search on pipeline parallel ranks without a decoder.
+
+
+
+
+void setupSpeculativeDecoding (
+
+
+SpeculativeDecodingMode const & speculativeDecodingMode ,
+SizeType32 maxTokensPerEngineStep ,
+nvinfer1 :: DataType dtype ,
+ModelConfig const & modelConfig ,
+WorldConfig const & worldConfig ,
+BufferManager const & bufferManager ,
+
+
+)
+Setup buffers for speculative decoding.
+
+
+
+
+Disable lookahead decoding.
+
+
+
+
+TensorPtr getFinishedSum ( ) const
+
+Returns:
+[batchSize], number of finished sequences per request, on gpu
+
+
+
+
+
+
+TensorPtr getFinishReasons ( ) const
+
+Returns:
+[batchSize, beamWidth], FinishedState value, on gpu
+
+
+
+
+
+
+TensorPtr getIds ( ) const
+
+Returns:
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu. In case of beam search, contains the ungathered data.
+
+
+
+
+
+
+TensorPtr getIds ( SizeType32 batchIdx ) const
+
+Parameters:
+batchIdx – index of the batch
+
+Returns:
+[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx , on gpu. In case of beam search, contains the ungathered data.
+
+
+
+
+
+
+TensorPtr getGatheredIds ( ) const
+
+Returns:
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding, on gpu.
+
+
+
+
+
+
+TensorPtr getGatheredIds ( SizeType32 batchIdx ) const
+
+Parameters:
+batchIdx – index of the batch
+
+Returns:
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], only used for beam search. It contains gathered token ids without padding for request batchIdx , on gpu.
+
+
+
+
+
+
+TensorPtr getParentIds ( ) const
+
+Returns:
+[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
+
+
+
+
+
+
+TensorPtr getCumLogProbs ( ) const
+
+Returns:
+[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
+
+
+
+
+
+
+TensorPtr getCumLogProbs ( SizeType32 batchIdx ) const
+
+Returns:
+[maxBeamWidth], cumulative log probabilities (per beam), on gpu
+
+
+
+
+
+
+TensorPtr getLogProbs ( ) const
+
+Returns:
+[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+
+
+
+
+
+
+TensorPtr getLogProbs ( SizeType32 batchIdx ) const
+
+Returns:
+[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
+
+
+
+
+
+
+TensorPtr getSequenceLengths ( ) const
+
+Returns:
+[batchSize, maxBeamWidth], sequence lengths, on gpu
+
+
+
+
+
+
+TensorPtr getSequenceLengths ( SizeType32 batchIdx ) const
+
+Parameters:
+batchIdx – index of the batch
+
+Returns:
+[maxBeamWidth], sequence lengths for request batchIdx , on gpu
+
+
+
+
+
+
+TensorPtr getAllNewTokens ( ) const
+Get maxTokensPerStep tokens generated in the last forward pass.
+
+Returns:
+[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
+
+
+
+
+
+
+TensorPtr getNextDraftTokens ( ) const
+
+Returns:
+[batchSize, maxDraftTokens], predicted draft tokens for next step, on gpu
+
+
+
+
+
+
+TensorPtr getPrevDraftTokensLengths ( ) const
+
+Returns:
+[batchSize], predicted draft tokens lengths for previous step, on gpu
+
+
+
+
+
+
+TensorPtr getNextDraftTokensLengths ( ) const
+
+Returns:
+[batchSize], predicted draft tokens lengths for next step, on gpu
+
+
+
+
+
+
+TensorPtr getAcceptedLengthsCumSum ( ) const
+
+Returns:
+[batchSize + 1], exclusive sum of accepted draft token lengths, on gpu
+
+
+
+
+
+
+TensorPtr getAcceptedPackedPaths ( ) const
+
+Returns:
+[batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
+
+
+
+
+
+
+TensorPtr getFinishedSteps ( ) const
+
+Returns:
+[maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
+
+
+
+
+
+
+SizeType32 getMaxBatchSize ( ) const
+
+
+
+
+SizeType32 getMaxBeamWidth ( ) const
+
+
+
+
+SizeType32 getMaxSequenceLength ( ) const
+
+
+
+
+SizeType32 getMaxDecodingDecoderTokens ( ) const
+
+
+
+
+SizeType32 getMaxDecodingEngineTokens ( ) const
+
+
+
+
+std :: vector < SizeType32 > const & getNumDecodingEngineTokens ( ) const
+Get the number of tokens for all requests in the batch.
+
+Returns:
+The number of tokens for all requests in the batch.
+
+
+
+
+
+
+SizeType32 getNumDecodingEngineTokens ( SizeType32 batchIdx ) const
+Get the number of tokens for a specific request in the batch.
+
+Parameters:
+batchIdx – The index of the request in the batch.
+
+Returns:
+The number of tokens for the specified request.
+
+
+
+
+
+
+void setNumDecodingEngineTokens (
+
+
+SizeType32 batchIdx ,
+SizeType32 numTokens ,
+
+
+)
+Set the number of tokens for a specific request in the batch.
+
+Parameters:
+
+
+
+
+
+
+
+SpeculativeDecodingMode getSpeculativeDecodingMode ( ) const
+Get the speculative decoding mode.
+
+
+
+
+ExplicitDraftTokensBuffers :: Inputs const & getExplicitDraftTokensBuffers (
+
+
+
+
+) const
+Get the explicit draft tokens buffers.
+
+
+
+
+EagleBuffers :: Inputs const & getEagleBuffers ( ) const
+Get the eagle buffers.
+
+
+
+
+LookaheadDecodingBuffers const & getLookaheadBuffers ( ) const
+Get the lookahead buffers.
+
+
+
+
+BeamSearchBuffers const & getBeamSearchBuffers ( ) const
+Workspace for beam search in streaming mode.
+
+
+
+
+TensorPtr getCacheIndirectionInput ( ) const
+Cache indirection input for beam search.
+
+
+
+
+TensorPtr getCacheIndirectionOutput ( ) const
+Cache indirection output for beam search.
+
+
+
+
+std :: optional < std :: vector < SizeType32 > > const & getGenerationSteps (
+
+
+
+
+) const
+Get the generation steps for all requests in the batch.
+
+Returns:
+The generation steps for all requests in the batch.
+
+
+
+
+
+
+void setGenerationSteps (
+
+
+std :: vector < SizeType32 > const & generationSteps ,
+
+
+)
+Set the generation steps for all requests in the batch.
+
+Parameters:
+generationSteps – The generation steps for all requests in the batch.
+
+
+
+
+
+
+DecodingInput & getJointDecodingInput ( ) const
+Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+
+
+
+
+DecodingOutput & getJointDecodingOutput ( ) const
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+
+
+
+
+
Private Functions
+
+
+void setupBuffers (
+
+
+nvinfer1 :: DataType dtype ,
+BufferManager const & bufferManager ,
+
+
+)
+
+
+
+
+void reshapeBuffers (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+SizeType32 maxAttentionWindow ,
+SizeType32 sinkTokenLength ,
+SizeType32 maxSequenceLength ,
+ModelConfig const & modelConfig ,
+WorldConfig const & worldConfig ,
+BufferManager const & bufferManager ,
+
+
+)
+
+
+
+
+void setupCacheIndirectionBuffers ( BufferManager const & bufferManager )
+
+
+
+
+void reshapeCacheIndirectionBuffers (
+
+
+SizeType32 maxBatchSize ,
+SizeType32 maxBeamWidth ,
+SizeType32 maxAttentionWindow ,
+
+
+)
+
+
+
+
+void setupSpeculativeDecodingBuffers (
+
+
+SpeculativeDecodingMode speculativeDecodingMode ,
+nvinfer1 :: DataType dtype ,
+BufferManager const & bufferManager ,
+
+
+)
+
+
+
+
+void reshapeSpeculativeDecodingBuffers (
+
+
+SpeculativeDecodingMode const & speculativeDecodingMode ,
+SizeType32 maxTokensPerEngineStep ,
+ModelConfig const & modelConfig ,
+WorldConfig const & worldConfig ,
+BufferManager const & bufferManager ,
+
+
+)
+
+
+
+
+
Private Members
+
+
+SizeType32 mMaxBatchSize = { }
+
+
+
+
+SizeType32 mMaxBeamWidth = { }
+
+
+
+
+SizeType32 mMaxSequenceLength = { }
+
+
+
+
+DecodingInputPtr mJointDecodingInput
+Stateful inputs for the decoder. Allocated for maxBatchSize slots.
+
+
+
+
+DecodingOutputPtr mJointDecodingOutput
+Stateful outputs for the decoder. Allocated for maxBatchSize slots.
+
+
+
+
+TensorPtr mFinishedSteps
+[maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token of maxTokensPerStep, on gpu
+
+
+
+
+std :: unique_ptr < BeamSearchBuffers > mBeamSearchBuffers
+Workspace for beam search in streaming mode.
+
+
+
+
+SizeType32 mMaxDecodingDecoderTokens = { 1 }
+
+
+
+
+SizeType32 mMaxDecodingEngineTokens = { 1 }
+
+
+
+
+std :: vector < SizeType32 > mNumDecodingEngineTokens
+[batchSize], the num tokens of each request.
+
+
+
+
+SpeculativeDecodingMode mSpeculativeDecodingMode = { SpeculativeDecodingMode :: None ( ) }
+
+
+
+
+
+
+
+
+
+
+
+
+
+ipcUtils.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
Functions
+
+
+void lamportInitializeAll (
+
+
+void * buffer_0 ,
+void * buffer_1 ,
+void * buffer_2 ,
+size_t size ,
+
+
+)
+
+
+
+
+bool canAccessPeer ( WorldConfig const & worldConfig )
+
+
+
+
+
+class AllReduceBuffers
+
+
+
+
+
+
+
+class IpcMemory
+
+
+
Public Functions
+
+
+IpcMemory (
+
+
+std :: size_t bufferSize ,
+BufferManager const & manager ,
+WorldConfig const & worldConfig ,
+bool openIpc = true ,
+
+
+)
+
+
+
+
+~IpcMemory ( )
+
+
+
+
+IpcMemory ( IpcMemory const & ) = delete
+
+
+
+
+IpcMemory & operator = ( IpcMemory const & ) = delete
+
+
+
+
+IpcMemory ( IpcMemory & & ) = default
+
+
+
+
+IpcMemory & operator = ( IpcMemory & & ) = default
+
+
+
+
+inline std :: vector < void * > const & getCommPtrs ( ) const
+
+
+
+
Public Static Attributes
-
-static constexpr auto value = BufferDataType :: kTrtPointerType
+
+static size_t constexpr FLAGS_SIZE = ( tensorrt_llm :: kernels :: MAX_ALL_REDUCE_BLOCKS + 1 ) * sizeof ( uint32_t )
+
+
+
+
+
Private Functions
+
+
+void allocateIpcMemory (
+
+
+std :: size_t bufferSize ,
+BufferManager const & manager ,
+WorldConfig const & worldConfig ,
+
+
+)
+
+
+
+
+void destroyIpcMemory ( )
+
+
+
+
+
Private Members
+
+
+SizeType32 mTpRank
+
+
+
+
+std :: vector < void * > mCommPtrs
+
+
+
+
+BufferPtr mBuffer
+
+
+
+
+bool mOpenIpc
+
+
+
+
+
+
+
+
+
+
+
+memoryCounters.h
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class MemoryCounters
+
+
Public Types
+
+
+using SizeType32 = std :: size_t
+
+
+
+
+using DiffType = std :: ptrdiff_t
+
+
+
+
+
+
Public Static Functions
+
+
+static MemoryCounters & getInstance ( )
+
+
+
+
+static std :: string bytesToString ( SizeType32 bytes , int precision = 2 )
+
+
+
+
+static std :: string bytesToString ( DiffType bytes , int precision = 2 )
+
+
+
+
+
Private Members
+
+
+std :: atomic < SizeType32 > mGpu = { }
+
+
+
+
+std :: atomic < SizeType32 > mCpu = { }
+
+
+
+
+std :: atomic < SizeType32 > mPinned = { }
+
+
+
+
+std :: atomic < SizeType32 > mUVM = { }
+
+
+
+
+std :: atomic < SizeType32 > mPinnedPool = { }
+
+
+
+
+std :: atomic < DiffType > mGpuDiff = { }
+
+
+
+
+std :: atomic < DiffType > mCpuDiff = { }
+
+
+
+
+std :: atomic < DiffType > mPinnedDiff = { }
+
+
+
+
+std :: atomic < DiffType > mUVMDiff = { }
+
+
+
+
+std :: atomic < DiffType > mPinnedPoolDiff = { }
@@ -11833,704 +11834,48 @@ one more than decoding draft tokens for prediction from primary head
-gptJsonConfig.h
+lookaheadBuffers.h
tensorrt_llm
-bufferManager.h
-tensorrt_llm::runtime::BufferManager
-IBufferPtr
-ITensorPtr
-CudaStreamPtr
-CudaMemPoolPtr
-BufferManager()
-~BufferManager()
-gpu()
-gpu()
-allocate()
-allocate()
-emptyBuffer()
-emptyTensor()
-setMem()
-setZero()
-copy()
-copy()
-copy()
-copy()
-copy()
-copyFrom()
-copyFrom()
-copyFrom()
-copyFrom()
-copyFrom()
-getStream()
-memoryPoolReserved()
-memoryPoolUsed()
-memoryPoolFree()
-memoryPoolTrimTo()
-gpuSync()
-gpuSync()
-cpu()
-cpu()
-pinned()
-pinned()
-pinnedPool()
-pinnedPool()
-managed()
-managed()
-ipcNvls()
-kBYTE_TYPE
-mStream
-mPool
-mTrimPool
+iBuffer.h
+PointerElementType
+MemoryType
+constPointerCast()
+constPointerCast()
+bufferCast()
+bufferCast()
+bufferCastOrNull()
+bufferCastOrNull()
+bufferCastOrNull()
+bufferCastOrNull()
+operator<<()
+tensorrt_llm::runtime::BufferDataType
-rawEngine.h
-tensorrt_llm::runtime::RawEngine
-Type
-FilePath
-AddressWithSize
-HostMemory
+tensorrt_llm::runtime::BufferRange
-RawEngine()
-RawEngine()
-RawEngine()
-getType()
-getPath()
-getPathOpt()
-setPath()
-getManagedWeightsMapOpt()
-setManagedWeightsMap()
-getAddress()
-getSize()
-getHostMemory()
-mEngineAddr
-mEngineSize
-mType
-mEnginePath
-mEngineBuffer
-mManagedWeightsMap
+tensorrt_llm::runtime::DataTypeTraits
+tensorrt_llm::runtime::DataTypeTraits< kDataType, kUnsigned, true >
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kBOOL, kUnsigned >
-loraModule.h
-operator<<()
-tensorrt_llm::runtime::LoraModule
-ModuleType
-kINVALID
-kATTN_QKV
-kATTN_Q
-kATTN_K
-kATTN_V
-kATTN_DENSE
-kMLP_H_TO_4H
-kMLP_4H_TO_H
-kMLP_GATE
-kCROSS_ATTN_QKV
-kCROSS_ATTN_Q
-kCROSS_ATTN_K
-kCROSS_ATTN_V
-kCROSS_ATTN_DENSE
-kMOE_H_TO_4H
-kMOE_4H_TO_H
-kMOE_GATE
-kMOE_ROUTER
-kMLP_ROUTER
-kMLP_GATE_UP
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kFLOAT >
-TensorPtr
-LoraModule()
-LoraModule()
-LoraModule()
-operator=()
-flattenedInOutSize()
-inSize()
-outSize()
-localInSize()
-localOutSize()
-localScalesSize()
-localInDim()
-localOutDim()
-localInAdapterSize()
-localOutAdapterSize()
-localInOutSize()
-localTotalSize()
-value()
-name()
-inDim()
-outDim()
-inDimFirst()
-outDimFirst()
-inTpSplitDim()
-outTpSplitDim()
-createLoraModules()
-toModuleType()
-toModuleName()
-toModuleName()
-mType
-mInDim
-mOutDim
-mInDimFirst
-mOutDimFirst
-mInTpSplitDim
-mOutTpSplitDim
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kHALF >
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT32 >
-request.h
-tensorrt_llm::runtime::decoder_batch::Request
-TensorConstPtr
-TensorPtr
-BufferPtr
-Request()
-ids
-inputLen
-maxNewTokens
-endId
-generatedTokensPerEngineStep
-embeddingBias
-badWordsList
-stopWordsList
-draftTokens
-draftLogits
-medusaPaths
-medusaTreeIds
-lookaheadRuntimeConfig
-eagleConfig
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT32, true >
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT64 >
-cudaStream.h
-tensorrt_llm::runtime::CudaStream
-CudaStream()
-CudaStream()
-CudaStream()
-getDevice()
-get()
-synchronize()
-record()
-record()
-wait()
-wait()
-StreamPtr
-mStream
-mDevice
-tensorrt_llm::runtime::CudaStream::Deleter
-Deleter()
-Deleter()
-operator()()
-mOwnsStream
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT64, true >
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT8 >
+tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kUINT8, kUnsigned >
-cudaEvent.h
-tensorrt_llm::runtime::CudaEvent
-ipcNvlsMemory.h
-MPI_group_barrier()
-ipcNvlsSupported()
-ipcNvlsAllocate()
-ipcNvlsFree()
-tensorrt_llm::runtime::DeviceAllocationNvls
-DeviceAllocationNvls()
-~DeviceAllocationNvls()
-reset()
-getMulticastPointer()
-getUnicastPointer()
-getIpcUnicastPointers()
-getCapacity()
-free()
-_capacity
-_handle
+decodingOutput.h
+tensorrt_llm::batch_manager
+tensorrt_llm::runtime::DecodingOutput
+TensorPtr
+DecodingOutput()
+ids
+gatheredIds
+newTokensSteps
+newTokens
+newTokensVec
+finishReasons
+finishedSum
+logProbs
+cumLogProbs
+parentIds
+lengths
+cacheIndirection
+logProbsTiled
+beamHypotheses
+speculativeDecodingOutputs
+explicitDraftTokensBuffers
+lookaheadOutputs
+eagleBuffers
+kNegativeInfinity
+tensorrt_llm::runtime::DecodingOutput::BeamHypotheses
-tensorrt_llm::runtime::IpcNvlsHandle
-iTensor.h
-nvinfer1
-operator<<()
-operator<<()
-bufferCastOrNull()
-bufferCastOrNull()
-bufferCastOrNull()
-bufferCastOrNull()
-tensorrt_llm::runtime::ITensor
+
+promptTuningParams.h
-gptDecoderBatched.h
-tensorrt_llm::runtime::GptDecoderBatched
-CudaStreamPtr
-LlmRequestPtr
-RequestVector
-TensorPtr
-GptDecoderBatched()
-setup()
-disableLookahead()
-forwardAsync()
-forward()
-finalize()
-getDecoderStream()
-getUnderlyingDecoder()
-getBufferManager()
-GptDecoderPtr
-forwardDispatch()
-mRuntimeStream
-mDecoderStream
-mBufferManager
-mDecoder
+bufferManager.h
-eagleModule.h
+worldConfig.h
+
+loraModule.h
+
+speculativeDecodingMode.h
+
+cudaEvent.h
+
+decodingInput.h
+
speculativeDecodingModule.h
-lookaheadBuffers.h
-tensorrt_llm::runtime::LookaheadDecodingBuffers
-TensorPtr
-LookaheadDecodingBuffers()
-generationLengths
-positionOffsets
-packedMasks
-positionIds
+iGptDecoderBatched.h
+tensorrt_llm::runtime::IGptDecoderBatched
-tensorrt_llm::runtime::LookaheadRuntimeBuffers
-TensorPtr
-TensorMap
-LookaheadRuntimeBuffers()
-setFromInputs()
-reshape()
-insertInputTensors()
-enableLookaheadDecoding()
-disableLookaheadDecoding()
-cumSumLength
-packedMasksDevice
-generationLengthsDevice
-positionOffsetsDevice
-positionIdsDevice
-packedMaskHost
-generationLengthsHost
-positionOffsetsHost
-positionIdsHost
-packedMaskHostCopy
-generationLengthsHostCopy
-positionOffsetsHostCopy
-positionIdsHostCopy
-useSpecDecoding
-batchSlotsHostCopy
+tensorrt_llm::runtime::decoder
+tensorrt_llm::runtime::decoder_batch
-promptTuningParams.h
-tensorrt_llm::runtime::GenericPromptTuningParams
-tensorrt_llm::runtime::PromptTuningParams
-TensorPtr
-SizeType32
-PromptTuningParams()
-fillTasksTensor()
+eagleModule.h
-medusaModule.h
-tensorrt_llm::runtime::MedusaModule
-TensorPtr
-MedusaChoices
-MedusaModule()
-MedusaModule()
-getMedusaChoices()
-mDefaultMedusaChoices
+tllmLogger.h
-iBuffer.h
-PointerElementType
-MemoryType
-kGPU
-kCPU
-kPINNED
-kUVM
-kPINNEDPOOL
+gptDecoderBatched.h
+tensorrt_llm::runtime::GptDecoderBatched
-constPointerCast()
-constPointerCast()
-bufferCast()
-bufferCast()
-bufferCastOrNull()
-bufferCastOrNull()
-bufferCastOrNull()
-bufferCastOrNull()
-operator<<()
-tensorrt_llm::runtime::BufferDataType
-tensorrt_llm::runtime::BufferRange
-Base
-BufferRange()
-BufferRange()
-BufferRange()
+cudaStream.h
+tensorrt_llm::runtime::CudaStream
+CudaStream()
+CudaStream()
+CudaStream()
+getDevice()
+get()
+synchronize()
+record()
+record()
+wait()
+wait()
+StreamPtr
+mStream
+mDevice
+tensorrt_llm::runtime::CudaStream::Deleter
-tensorrt_llm::runtime::DataTypeTraits
-tensorrt_llm::runtime::DataTypeTraits< kDataType, kUnsigned, true >
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kBOOL, kUnsigned >
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kFLOAT >
-type
-name
-size
+ipcNvlsMemory.h
+MPI_group_barrier()
+ipcNvlsSupported()
+ipcNvlsAllocate()
+ipcNvlsFree()
+tensorrt_llm::runtime::DeviceAllocationNvls
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kHALF >
-type
-name
-size
+tensorrt_llm::runtime::IpcNvlsHandle
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT32 >
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT32, true >
-type
-name
-size
+samplingConfig.h
+SET_FROM_OPTIONAL
+tensorrt_llm::runtime::SamplingConfig
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT64 >
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT64, true >
-type
-name
-size
+request.h
+tensorrt_llm::runtime::decoder_batch::Request
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kINT8 >
-tensorrt_llm::runtime::DataTypeTraits< nvinfer1::DataType::kUINT8, kUnsigned >
-type
-name
-size
+decoderState.h
+tensorrt_llm::runtime::decoder::BeamSearchBuffers
-tensorrt_llm::runtime::IBuffer
-UniquePtr
-SharedPtr
-UniqueConstPtr
-SharedConstPtr
-DataType
-data()
-data()
-data()
-data()
-getSize()
-getSizeInBytes()
-getCapacity()
-getDataType()
-getDataTypeName()
-getMemoryType()
-getMemoryTypeName()
-resize()
-release()
-~IBuffer()
-IBuffer()
-operator=()
-getDataTypeName()
-slice()
-slice()
-slice()
-slice()
-view()
-view()
-view()
-wrap()
-wrap()
-wrap()
-wrap()
-wrap()
-memoryType()
-IBuffer()
-toBytes()
+tensorrt_llm::runtime::decoder::DecoderState
-tensorrt_llm::runtime::MemoryTypeString
-tensorrt_llm::runtime::MemoryTypeString< MemoryType::kCPU >
-tensorrt_llm::runtime::MemoryTypeString< MemoryType::kGPU >
-value
+ipcUtils.h
+lamportInitializeAll()
+canAccessPeer()
+tensorrt_llm::runtime::AllReduceBuffers
-tensorrt_llm::runtime::MemoryTypeString< MemoryType::kPINNED >
-value
+tensorrt_llm::runtime::IpcMemory
-tensorrt_llm::runtime::MemoryTypeString< MemoryType::kPINNEDPOOL >
-tensorrt_llm::runtime::MemoryTypeString< MemoryType::kUVM >
-
-tensorrt_llm::runtime::TRTDataType
-tensorrt_llm::runtime::TRTDataType< bool >
-
-tensorrt_llm::runtime::TRTDataType< float >
-
-tensorrt_llm::runtime::TRTDataType< half >
-
-tensorrt_llm::runtime::TRTDataType< kernels::FinishedState >
-
-tensorrt_llm::runtime::TRTDataType< kernels::KVCacheIndex >
-
-tensorrt_llm::runtime::TRTDataType< runtime::RequestType >
-
-tensorrt_llm::runtime::TRTDataType< std::int32_t >
-
-tensorrt_llm::runtime::TRTDataType< std::int64_t >
-
-tensorrt_llm::runtime::TRTDataType< std::int8_t >
-
-tensorrt_llm::runtime::TRTDataType< std::uint32_t >
-
-tensorrt_llm::runtime::TRTDataType< std::uint64_t >
-
-tensorrt_llm::runtime::TRTDataType< std::uint8_t >
-
-tensorrt_llm::runtime::TRTDataType< T * >
-
-tensorrt_llm::runtime::TRTDataType< void * >
-value
+memoryCounters.h
@@ -13682,9 +13683,9 @@ one more than decoding draft tokens for prediction from primary head
diff --git a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
index 97946a05ed..0f2a191a9c 100644
--- a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
+++ b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
@@ -11,7 +11,8 @@ from tensorrt_llm.mapping import Mapping
from ..attention_backend import (AttentionInputType, AttentionMetadata,
TrtllmAttention, TrtllmAttentionMetadata)
-from ..attention_backend.interface import (PositionalEmbeddingParams,
+from ..attention_backend.interface import (AttentionMask,
+ PositionalEmbeddingParams,
PredefinedAttentionMask)
from ..attention_backend.utils import create_attention, get_attention_backend
from ..distributed import AllReduceParams
@@ -67,8 +68,9 @@ class Attention(nn.Module):
config = config or ModelConfig()
self.hidden_size = hidden_size
self.num_heads = num_attention_heads
- self.head_dim = getattr(config.pretrained_config, "head_dim",
- self.hidden_size // self.num_heads)
+ self.head_dim = getattr(config.pretrained_config, 'head_dim', None)
+ if not isinstance(self.head_dim, int):
+ self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = max_position_embeddings
@@ -225,12 +227,12 @@ class Attention(nn.Module):
position_ids: Optional[torch.IntTensor],
hidden_states: Union[torch.Tensor, Fp4QuantizedTensor],
attn_metadata: AttentionMetadata,
- attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.
- CAUSAL,
+ attention_mask: AttentionMask = PredefinedAttentionMask.CAUSAL,
mrope_config: Optional[dict] = None,
all_reduce_params: Optional[AllReduceParams] = None,
lora_params: Optional[dict] = None,
attention_window_size: Optional[int] = None,
+ attention_mask_data: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
"""
@@ -240,12 +242,12 @@ class Attention(nn.Module):
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
- attention_mask (PredefinedAttentionMask): The attention mask type.
+ attention_mask (AttentionMask): The attention mask type.
mrope_config (Optional[dict]): The MROPE configuration.
all_reduce_params (Optional[AllReduceParams]): The all reduce parameters.
lora_params (Optional[dict]): The LoRA parameters.
attention_window_size (Optional[int]): The attention window size.
-
+ attention_mask_data (Optional[torch.Tensor]): The attention mask data.
Returns:
torch.Tensor: The output tensor.
"""
@@ -268,7 +270,7 @@ class Attention(nn.Module):
out_scale = None
out_scale_sf = None
- if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales:
+ if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales or self.o_proj.has_fp8_rowwise:
out_scale = self.o_proj.inv_input_scale
if self.o_proj.has_nvfp4 and self.support_nvfp4_output:
out_scale_sf = self.o_proj.input_scale
@@ -283,7 +285,8 @@ class Attention(nn.Module):
out_scale_sf=out_scale_sf,
attention_mask=attention_mask,
mrope_config=mrope_config,
- attention_window_size=attention_window_size)
+ attention_window_size=attention_window_size,
+ attention_mask_data=attention_mask_data)
hidden_states = attn_output
attn_output = self.o_proj(attn_output,
all_reduce_params=all_reduce_params,
@@ -356,7 +359,7 @@ def fp8_block_scaling_bmm_out(
out: torch.Tensor,
) -> torch.Tensor:
sm_version = get_sm_version()
- if sm_version == 90:
+ if sm_version == 90 or sm_version == 89:
mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
mat1)
torch.ops.trtllm.fp8_block_scaling_bmm_out(mat1_fp8, mat2_fp8,
diff --git a/latest/_downloads/b6815cf245cc7dc7a26a6f727fdc2dc4/model.py b/latest/_downloads/b6815cf245cc7dc7a26a6f727fdc2dc4/model.py
index 60a2f8b38a..0fb003a90c 100644
--- a/latest/_downloads/b6815cf245cc7dc7a26a6f727fdc2dc4/model.py
+++ b/latest/_downloads/b6815cf245cc7dc7a26a6f727fdc2dc4/model.py
@@ -21,7 +21,7 @@ import torch
from tqdm import tqdm
from ..._utils import pad_vocab_size
-from ...functional import Tensor, recv, send
+from ...functional import LayerNormType, Tensor, recv, send
from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
Embedding, GatedMLP, RmsNorm, SharedMoE)
from ...layers.moe import MOEWeightWrapper
@@ -56,6 +56,9 @@ class QWenDecoderLayer(Module):
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
+ # Qwen3: Enable qk_layernorm for Q/K normalization (similar to Gemma3)
+ qk_layernorm = config.qwen_type in ('qwen3', 'qwen3_moe')
+
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
@@ -78,7 +81,11 @@ class QWenDecoderLayer(Module):
cp_group=config.mapping.cp_group,
quant_mode=config.quant_mode,
use_logn_scaling=config.use_logn_attn,
- dense_bias=False)
+ dense_bias=False,
+ # Qwen3: Add Q/K layer normalization
+ qk_layernorm=qk_layernorm,
+ layernorm_type=LayerNormType.RmsNorm
+ if qk_layernorm else LayerNormType.LayerNorm)
if config.moe.has_moe():
mlp_kwargs = {'moe_config': config.moe, 'mapping': config.mapping}
@@ -353,6 +360,11 @@ class QWenForCausalLM(DecoderModelForCausalLM):
"transformer": "language_model.model",
"lm_head": "language_model.lm_head",
}
+ elif config.qwen_type in ("qwen3", "qwen3_moe"):
+ custom_dict = {
+ "q_layernorm": "q_norm",
+ "k_layernorm": "k_norm",
+ }
loader = ModelWeightsLoader(hf_model_dir, custom_dict)
model = cls(config)
if config.qwen_type == "qwen" and model.config.mapping.has_tp():
diff --git a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
index dbb4de7ded..cf28ecd326 100644
--- a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
+++ b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
@@ -1,4 +1,5 @@
import copy
+import functools
import json
import math
import os
@@ -222,7 +223,8 @@ class _ModelFormatKind(Enum):
class DecodingBaseConfig(BaseModel):
max_draft_len: Optional[int] = None
- speculative_model: Optional[Union[str, Path]] = None
+ speculative_model_dir: Optional[Union[str, Path]] = None
+ num_extra_kv_tokens: int = 0
@classmethod
def from_dict(cls, data: dict):
@@ -235,6 +237,7 @@ class DecodingBaseConfig(BaseModel):
"Lookahead": LookaheadDecodingConfig,
"NGram": NGramDecodingConfig,
"DraftTarget": DraftTargetDecodingConfig,
+ "UserProvided": UserProvidedDecodingConfig,
}
config_class = config_classes.get(decoding_type)
@@ -246,6 +249,35 @@ class DecodingBaseConfig(BaseModel):
def _check_fields(self):
pass
+ def supports_backend(self, backend: str) -> bool:
+ """
+ Override if the speculation algorithm does not support
+ a subset of the possible backends.
+ """
+ return True
+
+ def validate(self) -> None:
+ """
+ Do any additional error checking here.
+ """
+
+ @functools.cached_property
+ def spec_dec_mode(self):
+ # spec_dec_mode has more functionality than the raw decoding_mode string.
+ # Use an alias for the import here to avoid name collisions with the one for the
+ # TRT backend.
+ from tensorrt_llm._torch.speculative.interface import \
+ SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+ return TorchSpeculativeDecodingMode.from_string(
+ self.decoding_type.upper())
+
+ def update_from_model_config(self, model_config):
+ pass
+
+ def get_draft_model_prompt(self,
+ input_tokens: torch.Tensor) -> torch.Tensor:
+ return input_tokens
+
class MedusaDecodingConfig(DecodingBaseConfig):
medusa_choices: Optional[List[List[int]]] = None
@@ -257,6 +289,9 @@ class MedusaDecodingConfig(DecodingBaseConfig):
decoding_type: ClassVar[str] = "Medusa"
+ def supports_backend(self, backend: str) -> bool:
+ return backend not in ("pytorch", "_autodeploy")
+
class EagleDecodingConfig(DecodingBaseConfig):
eagle_choices: Optional[List[List[int]]] = None
@@ -266,7 +301,6 @@ class EagleDecodingConfig(DecodingBaseConfig):
dynamic_tree_max_topK: Optional[int] = None
num_eagle_layers: Optional[int] = None
max_non_leaves_per_layer: Optional[int] = None
- pytorch_weights_path: Optional[str] = None
eagle3_one_model: Optional[bool] = True
@classmethod
@@ -275,13 +309,43 @@ class EagleDecodingConfig(DecodingBaseConfig):
decoding_type: ClassVar[str] = "Eagle"
+ def validate(self) -> None:
+ if self.speculative_model_dir is None:
+ raise ValueError("Draft model must be provided for EAGLE")
+
+ @functools.cached_property
+ def spec_dec_mode(self):
+ from tensorrt_llm._torch.speculative.interface import \
+ SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+ if self.eagle3_one_model:
+ return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
+ return TorchSpeculativeDecodingMode.EAGLE3
+
+ def get_draft_model_prompt(self,
+ input_tokens: torch.Tensor) -> torch.Tensor:
+ """
+ Eagle3 always throws away the first token when processing draft inputs
+ """
+ return input_tokens[1:]
+
+
+class UserProvidedDecodingConfig(DecodingBaseConfig):
+ # Type should be Drafter, but it leads to circular import
+ drafter: object
+
+ @classmethod
+ def from_dict(cls, data: dict):
+ return cls(**data)
+
+ decoding_type: ClassVar[str] = "User_Provided"
+
class NGramDecodingConfig(DecodingBaseConfig):
"""
Configuration for NGram drafter speculative decoding.
Arguments:
- prompt_lookup_num_tokens: int
+ max_draft_len: int
The length maximum of draft tokens (can be understood as length maximum of output draft tokens).
max_matching_ngram_size: int
@@ -297,7 +361,6 @@ class NGramDecodingConfig(DecodingBaseConfig):
Whether to use a common pool for all requests, or the pool is private for each request if False.
"""
- prompt_lookup_num_tokens: int = 2
max_matching_ngram_size: int = 4
is_keep_all: bool = True
is_use_oldest: bool = True
@@ -309,23 +372,39 @@ class NGramDecodingConfig(DecodingBaseConfig):
decoding_type: ClassVar[str] = "NGram"
+ def supports_backend(self, backend: str) -> bool:
+ return backend == "pytorch"
+
class DraftTargetDecodingConfig(DecodingBaseConfig):
- pytorch_weights_path: Optional[str] = None
@classmethod
def from_dict(cls, data: dict):
return cls(**data)
- decoding_type: ClassVar[str] = "DraftTarget"
+ decoding_type: ClassVar[str] = "Draft_Target"
+
+ def supports_backend(self, backend: str) -> bool:
+ return backend == "pytorch"
class MTPDecodingConfig(DecodingBaseConfig):
- num_nextn_predict_layers: Optional[int] = 1
- use_relaxed_acceptance_for_thinking: Optional[bool] = False
- relaxed_topk: Optional[int] = 1
- relaxed_delta: Optional[float] = 0.
- use_mtp_vanilla: Optional[bool] = False
+ num_nextn_predict_layers: int = 1
+ use_relaxed_acceptance_for_thinking: bool = False
+ relaxed_topk: int = 1
+ relaxed_delta: float = 0.
+ use_mtp_vanilla: bool = False
+
+ # TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`
+ # Now we need a flag when MTPDecodingConfig is updated by PyTorchModelEngine.
+ num_nextn_predict_layers_from_model_config: int = 1
+
+ # TODO: Hard code for DeepSeek R1
+ # When encounter <think>, start thinking phase.
+ # When encounter </think>, end thinking phase.
+ # <think> [thinking phase] </think> [real output]
+ BEGIN_THINKING_PHASE_TOKEN: int = 128798
+ END_THINKING_PHASE_TOKEN: int = 128799
@classmethod
def from_dict(cls, data: dict):
@@ -333,6 +412,22 @@ class MTPDecodingConfig(DecodingBaseConfig):
decoding_type: ClassVar[str] = "MTP"
+ def supports_backend(self, backend: str) -> bool:
+ return backend == "pytorch"
+
+ @functools.cached_property
+ def spec_dec_mode(self):
+ from tensorrt_llm._torch.speculative.interface import \
+ SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+ if self.num_nextn_predict_layers_from_model_config == 1 and not self.use_mtp_vanilla:
+ return TorchSpeculativeDecodingMode.MTP_EAGLE
+ return TorchSpeculativeDecodingMode.MTP
+
+ def update_from_model_config(self, model_config):
+ assert self.num_nextn_predict_layers > 0
+ if model_config.num_nextn_predict_layers == 1 and not self.use_mtp_vanilla:
+ self.num_extra_kv_tokens = self.num_nextn_predict_layers - 1
+
class PybindMirror(ABC):
''' A class containing the utilities for mirroring Python classes to
@@ -623,6 +718,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror):
self.max_ngram_size,
self.max_verification_set_size)
+ def supports_backend(self, backend: str) -> bool:
+ return backend not in ("pytorch", "_autodeploy")
+
decoding_type: ClassVar[str] = "Lookahead"
@@ -633,6 +731,7 @@ SpeculativeConfig: TypeAlias = Optional[Union[
MedusaDecodingConfig,
MTPDecodingConfig,
NGramDecodingConfig,
+ UserProvidedDecodingConfig,
]]
@@ -1024,7 +1123,7 @@ class BaseLlmArgs(BaseModel):
return self._model_format
@property
- def speculative_model(self) -> Optional[_ModelFormatKind]:
+ def speculative_model_dir(self) -> Optional[_ModelFormatKind]:
return self._speculative_model
@property
@@ -1301,33 +1400,40 @@ class BaseLlmArgs(BaseModel):
@model_validator(mode="after")
def validate_speculative_config(self):
if self.speculative_config:
- if isinstance(self.speculative_config, LookaheadDecodingConfig):
- lookahead_config = self.speculative_config
- # Update the build config
- _, _, max_draft_tokens, _ = lookahead_config.calculate_speculative_resource(
- )
- self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
- if max_draft_tokens > self.build_config.max_draft_len:
- self.build_config.max_draft_len = max_draft_tokens
+ if not self.speculative_config.supports_backend(self.backend):
+ raise ValueError(
+ f"Speculation type {self.speculative_config.decoding_type} does not "
+ f"support backend {self.backend}")
+ # Below, we only need to set speculative_decoding_mode/decoding_config for speculation
+ # on the TRT backend.
+ if isinstance(self.speculative_config, LookaheadDecodingConfig):
+ max_draft_len = self.speculative_config.calculate_speculative_resource(
+ )[2]
+ assert max_draft_len > 0
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
+ self.build_config.max_draft_len = max(
+ self.build_config.max_draft_len, max_draft_len)
self.decoding_config = DecodingConfig(
decoding_mode=DecodingMode.Lookahead(),
lookahead_decoding_config=PybindMirror.maybe_to_pybind(
- lookahead_config))
- elif isinstance(self.speculative_config, MedusaDecodingConfig):
- self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
+ self.speculative_config))
+ elif isinstance(self.speculative_config, MedusaDecodingConfig):
assert self.speculative_config.max_draft_len > 0
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
self.build_config.max_draft_len = self.speculative_config.max_draft_len
self.decoding_config = DecodingConfig(
decoding_mode=DecodingMode.Medusa(),
medusa_choices=self.speculative_config.medusa_choices)
+
elif isinstance(self.speculative_config, EagleDecodingConfig):
- self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
assert self.speculative_config.max_draft_len > 0
-
+ assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
self.build_config.max_draft_len = self.speculative_config.max_draft_len
-
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
+ if self.speculative_config.eagle3_one_model:
+ self.speculative_config.num_extra_kv_tokens = self.speculative_config.max_draft_len - 1
if self.backend not in ['pytorch', '_autodeploy']:
eagle_config = _EagleConfig(
self.speculative_config.eagle_choices,
@@ -1338,59 +1444,39 @@ class BaseLlmArgs(BaseModel):
self.decoding_config = DecodingConfig(
decoding_mode=DecodingMode.Eagle(),
eagle_config=eagle_config)
- else:
- from tensorrt_llm._torch.speculative import Eagle3Config
- self.speculative_config = Eagle3Config(
- max_draft_tokens=self.speculative_config.max_draft_len,
- draft_model_path=self.speculative_config.
- pytorch_weights_path,
- eagle3_one_model=self.speculative_config.
- eagle3_one_model)
+
elif isinstance(self.speculative_config, NGramDecodingConfig):
- self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
assert self.backend in ['pytorch', '_autodeploy']
- assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
+ assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
self.build_config.max_draft_len = self.speculative_config.max_draft_len
- from tensorrt_llm._torch.speculative import NGramConfig
- self.speculative_config = NGramConfig(
- prompt_lookup_num_tokens=self.speculative_config.
- prompt_lookup_num_tokens,
- max_matching_ngram_size=self.speculative_config.
- max_matching_ngram_size,
- is_keep_all=self.speculative_config.is_keep_all,
- is_use_oldest=self.speculative_config.is_use_oldest,
- is_public_pool=self.speculative_config.is_public_pool,
- )
+
elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
- self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
- assert self.backend == 'pytorch'
+ assert self.backend in ['pytorch']
assert self.speculative_config.max_draft_len > 0
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
self.build_config.max_draft_len = self.speculative_config.max_draft_len
- from tensorrt_llm._torch.speculative import DraftTargetConfig
- self.speculative_config = DraftTargetConfig(
- max_draft_tokens=self.speculative_config.max_draft_len,
- draft_model_path=self.speculative_config.
- pytorch_weights_path)
+
elif isinstance(self.speculative_config, MTPDecodingConfig):
- from tensorrt_llm._torch.speculative import MTPConfig
- self.speculative_config = MTPConfig(
- num_nextn_predict_layers=self.speculative_config.
- num_nextn_predict_layers,
- max_batch_size=self.build_config.max_batch_size,
- use_relaxed_acceptance_for_thinking=self.speculative_config.
- use_relaxed_acceptance_for_thinking,
- relaxed_topk=self.speculative_config.relaxed_topk,
- relaxed_delta=self.speculative_config.relaxed_delta,
- use_mtp_vanilla=self.speculative_config.use_mtp_vanilla)
+ assert self.speculative_config.num_nextn_predict_layers > 0
+ self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers
+
+ elif isinstance(self.speculative_config,
+ UserProvidedDecodingConfig):
+ assert self.backend in ['pytorch', '_autodeploy']
+ self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.USER_PROVIDED
+ self.build_config.max_draft_len = self.speculative_config.max_draft_len
+
else:
raise ValueError(
- f"Speculative config type not recognized: {self.speculative_config}"
+ f"Unrecognized speculative config type {type(self.speculative_config)}"
)
+
else:
self.decoding_config = None
self._speculative_model = getattr(self.speculative_config,
- "speculative_model", None)
+ "speculative_model_dir", None)
speculative_model_obj = _ModelWrapper(
self._speculative_model
) if self._speculative_model is not None else None
@@ -1702,7 +1788,7 @@ class TorchLlmArgs(BaseLlmArgs):
moe_backend: str = Field(default='CUTLASS',
description="MoE backend to use.")
- mixed_sampler: bool = Field(
+ enable_mixed_sampler: bool = Field(
default=False,
description=
"If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1732,7 +1818,7 @@ class TorchLlmArgs(BaseLlmArgs):
torch_compile_config: Optional[TorchCompileConfig] = Field(
default=None, description="Torch compile config.")
- autotuner_enabled: bool = Field(
+ enable_autotuner: bool = Field(
default=True,
description="Enable autotuner only when torch compile is enabled.")
@@ -1918,7 +2004,7 @@ class TorchLlmArgs(BaseLlmArgs):
moe_load_balancer=self.moe_load_balancer,
attn_backend=self.attn_backend,
moe_backend=self.moe_backend,
- mixed_sampler=self.mixed_sampler,
+ enable_mixed_sampler=self.enable_mixed_sampler,
enable_trtllm_sampler=self.enable_trtllm_sampler,
kv_cache_dtype=self.kv_cache_dtype,
enable_iter_perf_stats=self.enable_iter_perf_stats,
@@ -1938,7 +2024,7 @@ class TorchLlmArgs(BaseLlmArgs):
torch_compile_enable_userbuffers=self.torch_compile_config.
enable_userbuffers if self.torch_compile_config is not None else
TorchCompileConfig.model_fields['enable_userbuffers'].default,
- autotuner_enabled=self.autotuner_enabled,
+ enable_autotuner=self.enable_autotuner,
enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
load_format=self.load_format,
enable_min_latency=self.enable_min_latency,
diff --git a/latest/_images/disaggregated-service_usage.png b/latest/_images/disaggregated-service_usage.png
deleted file mode 100644
index 6b98a223322753da05e305f51497fd3bea2ce4ff..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 39888
zcmeFZcT`j9+cp{zWpG3sm>ETBjucUf(mN_DAWc9(I+lRc03q}w3S%1)lq!Ux0}^Ub
zIs^y|4hSKjv`7sKBti%+B!SekqrdN*?_KAgbI$tCA7`C+FI{f--q}34pXa&DbzQel
zt~ppq95{9W1OiFeSYL7mf%g9d0_{Ei(|+K}%fs^iz(0GS&Q=#em?7C^;0Mvb3-%X4
zpm*s%b8qegeio0g_JD#whkAs6_w+%^?|?uku+60l*W-LwdEyNku5c}OZKgM8{`FIX
zW69-WwY~c-&-^rZc;EHj^CZi?x1{@e4lPBMT)OblFm*)D(D&e}(|44v?5o|m{_yC(
z`xO+e(f+sSM;1@VJQ?|`L+i_MY>}QMY-P+dk=96DbAX$t9&@d7)2j~{uk7LTD#_kZ
z;;eA&Akg_*?oh8^V-pb{R3K%jewOhpjr#vV=VYTcJ#)p7aOo@#EM
z@i%7Y*b~h;RJGfnd!0X?@a0Sq53RLPG1{^bU3+?3N3n|;w)&y9Av=
zz`KCsp8S6~@wcS(ZuUDOqTWu2W5e#U;InP8H#KWs;jM3yNXjGwVt;fP3EN7mj2SS+
z)GxOAx(5;r=HeFSf2)oA*PmL`Ja@O4Qk1wB@#NV!_=*fsB|goB?ji{@$sFrH)woKa
z=~1g})2b${bxiV`BoK-v*CG_6HY@=PV982J{gM1--1K2
zgb)+t@k9P?wV|L?GX1ET6Javld}_7
z-&q;uj9HFXXGV69H;FccOroZ%7};&1_|3%@bKbI}B&C}5S`vQu?XWo~)oiX7P76ta
zvu2ysz1|Gf39?EGm0sc+s<9mQZW7cOZutlTFzCmi3+smBiH(p)k;M8aC*=RdAt?@
zn`AvvEoyPftFOtT6s{*x=D$cHyB4Sjk3Sl0lMniq#g%vd(cn7)>XxcRYepVz&BKbGHnJp-^7X(0K4p7`ihAu9w~s
z?;Ex1%IierMi%CtAj*QyldKPeKyNRIJQnZLD;^wj8CETfcFyB}v$F8)brs2s{B-$9
z_EOVi0BXzV!kp;zkSTp|E8Yz-bi34fyXTBQr5!#Wwo5a8&988TSK8tDj9RUQ2i6$6KDT2P+|$Y$Ti>M=lRR+ZfAtx!VV3UeCla4qTj
zKgtwUhHoua*3n$`gM6mwpri$NL))CzFgnmup(5eS9?8wwOlla0>d{&~C2%xg;Pt64
zeuUe#6y+z+3LHV8D?%sp`QVb;COPvR;buWensKmU<2xR68e#}b7nPU(XdCfPCOR~D
zKQ{10_rTF{?p2W#aS&yB;Q{dS&g;Dk$&JJDL(qYpM;6?hGU+aO^U5@rN8-V51hyPQ
zH$M4qacPkz1K%rN^JBAU5_*{&l7d#>Mx_{KU83cz0HpPg)VgJff
za%7}K`{ZJ?BB#98ryL(99U%$$EPPynCsWLP<389l*M@2!N4VntsMaxIB=eyQs*-Ad
z`!y_cKB}l~xb>V#qa4WP0LEM6k`%Bl8i_VR;$Nz%WTRF6?xYIdg`MlFM`o_dl@_Bt1Dd6IW=u=!*p5IJ4|)9-)U
zEA-uE87;RB+Q6AwS%=x%c#UTDg-H8T<2TI<+Y!5XKY8FHZK@GGv-Y7O-=$*nQ5C@vu~Lnz>xm@JK#K+#GT~m2zz)b6O_)9@KL5?x(|Wda5E!*%5{_QD
zF552Pt!q)(!Vnno=E`n()Gr_q{wBdEzCHOj+RLBUm%Sy}U$c0#`SK!qOqFqv7d>TwzGR{;Ao-Rm51&g2DYq37|JSfI%qF$f3x%}
zM=Vu*8{E;LTJ#Fh36ZyLLM=)A4@!Sp9&BtP7M&+Q_2Yh(8XJrys1l`<9H5$N6u{5m
zZYGKwl9w4kNN-!M>hPU9H_ROIMHr9P$TgM8qKguOCj+n!SNh-4S{BmYtkP)8dp0WF
zJ1SSt5Z)+#lgXTb6y-RaZ>Xd`!yf7_q$kY85118-#qO&?SD-w_UcjiW_)YS$yF{d(
zO*~l}!-vYXZ$5*lb3QO&V
zER#>1)`iglA6F}DJ9JZm-zyVX@b1gYHq5fkfKKk1;PMflF%Po0ScM=dcfs>?Bkq2d
zYM({%Vpo!IL-asQgx2VPSkOambRq@`%Kmurr!16%I9tW}1Jo7#2LM)djY&llMknGc
z4eqy`9?fi}2YR3olX}av`{i+yk82%xBdtx>7_@gTp=T3!SH_0mbg#k^f%lrdS(E
ztA}DTBbkM}CC2ggT}(E$Rtr49tg1$ZfoFF`DfOWmR?x)8RiibwXmqx5b>Un6H>(x6
zNSJAjl8SRtfXp=6%OL)b4s#7sQoc#lpk8^wtN0>)NPeklrgdK5I5I8pUGlb%vm_%S
zi4fxJ();vgJ~d8{+vL(4x<05gJhRC9G~>rH$R4)&T@Nn?`skj2)N^0^=_?BHze)Jh
zU#LQR1Tp`VjkLQK-5GY-puL~CN@t)}<#4a{$^ZNd|4#`v@Phx5a^C}QbgaW6hWp-{
zejf(NUqvmr;1j?Al`|-Cnq(MJ#
zQvC!8KmpWsRfiy`qSfIwQ!IF+78;{puT)<`uWBu}ds-i^775?p{-lFkf0}~V!X*id
z6wuwaVfdO(T=&nuXAL+1(s%^*UwW>1!%X*_&_JN7OHz9i*IFs_%3AUL{y+@qX$6gL
z?(aT5FCoJI2H|sR_v6^(wGWFyHy(55Se-p;K;4tkZdAv+)(|`js4V#n#`l1oa`S)RZkH!
z`*s0%xbDAHN5YT&*Lvyy{uNIEmU}o_eXY;CCBSeWFLHn%ZV$ZWZ0?)XRfOqDa^!8AK6dg<9{BYyivl`C&Z-y9|t0G
z>Z7p6;dx~&B4%U2OjBXV!^0WV(PR;28KtPrQ?*D-x)M
z>Aj9~UuySLn-Bx@eAZNeVc?SCcBDY}GlC|K-*!iA^P>(qbl!Q?O+-c^@^aU=V)jJ;
zSZ$yiZ{#_z1of$zhsz1#vrPQ@A^zr>VcD7JzT5$du9>LMkL7wVZ)yc0Dc2rJLfLw4
z2j!!~&=!VwHgeG1_9UF34JR|y(dWnQ?-T1Y+f7{J9LyVP4~@+-RTkuT;2LihhFTD)
zra_#Nhqea?qIsRRMAjr(LBPKNPMGGbw|wCoB!s!=v2qiAo4@R64^f=(6JeL<_Lnnq
z7$Xgz3@fmSyfq$vdzUm5%^{Mu^0dSC%|z{gY>_UzC2{63A`w1sVNY*JSr9Yd8k71w
zB)xWIZi5IEUeZ6?>{n7Oe#Ioiaet^MGUgJQz!6DOB&)v+Vhn5
zWSH+oa1?OreGz@N$g?}7W*=>C({xJn=Ze(&q?LABr9y#OfoE~*J^ddm)t@|aq+6mH
zfw2>?C+D6V5jc5J?2m+bv29y+u;}mO?r^t&Z4^%9Zj?zk)Pe^hW^wTH!F192%+kl(
z`=0ZNQH#(;PE2yiR3sIhvsK2G8EOl_eK9C8`?e=tb3y4uTePyoD;pqn29B3EivFx6
zeARuVNrBTgGJdiKSv?IEwdjav6k7wcFs5#RiBBBvS6h?tT?MHJ~7!R9dT
zTtV{gf7avgmy>52D_$$@FJeDtbhD$&rw@@WLaS7BBa0^152OiyQTkMagO95HZyJjl
z_dTq2E=JDc3cY#(NMqqIZN$cUjUR^gb-lp2Aonr4?gKe>$68^pJ**Iz={|6~cRr0H;2Ic6qV<^_CYaL;Qxy@@
z7o}uzvFlCU%KR$V?leYjnVhqkpWs}R83Yv9P!
zqEBf}esFZ^I>Ur@W}zv-0})4Yf9Mh+tn`7y|5GXS>(p*U5r5c4pf$Io(*s$5v3~0*
z$FwC?)rmM6@OkXxt>eEvhIHy}+)IR*6inT)zg1eYPSq}S_=}zeFS>;T7ISvIjjS%JYj<0sR*NUOD3Nk91{=Nc
zwuUK#sQ1l*_hX9X5T07p#pRtyYI{^V{!#IS#j;=Vyv6F~an^%~Zwve&OYM@HNmIw1
zs(hP=Rfvcf83>N~Bqh*%WqlwzTYC&4V!m5L%-K{gHK%(0Y7)*HlVIc}d@#>^+`owc
zYT(oK*J1w&y3eTl+<^t1LfxGfLg~|`(2}YITa73n6
z&a%=G#OAb@fja8al>u2=TMam^4cP1`JS;4c2IoguPHJM8x1RxN(RsluD_&S$5z|{m
zCSqr(c6Vp65xn*eOi-ZX1)sZ1H)=c;7NAjK`_OQ=k&9nONFoU+#!eXl(XT?ZGvt*SgDJ!(;C3f2HWg|I;9%1)h9~*AL|5p)
zUr4iHWG>X!4QGKXGWS+%&MDs5*M8x=Lci|b`t85m+iz`PVmm^1f?NRbFC<2x=ou%C5MpKv1MA{o1m2h;4hga;N)KN
z6<$_iJUsgpklgwA(vIr5bR}_l{>CFo+)b%m--KoVVmQ;|oMUHg!`7gc)H7fp@xQaz
z_n$QpW@2BBs~#Er1ORlNMV^jP{#7+ir}uDCkk0t$l#|-t5#9U1{Bd_bv$&q)b7#G8
zBj)HutgNv29)X3fz3?1zU@&ju%QT74oG*Y5sWGyY{+SG{g`l#tY55@-wFPgYFcW?o
zFHCgw6UFUt!eZ8uWl-#m?A0kc*YfS}rvux%1X9vgX0dwr`pGoIzfEWJFIi57`c-WG
z87RWE<^39-j1Ri?jY}uuFmNG`cJ7Q9{Nx>UVa(~^+N)CmyfpEvTZ)Fd?4C)5`Cxvh^c$N&EY<~=9b2yLrrzK
zriKee;5r9aVioVy9d8QwL!?HwY+~BGwZxfsm&g>3x;s&ZxEbB)=qLUO(_C%f9;Lbz
z65CtAw&{rJH?Onn97ZVv*t7xl_ERV7AcgkQlYG#X=lzJeNrt9{jqvnqdty57%$2q=
zQS?lM#48?uL2on{Gx;YWx5i*A31M`jSfkFQVrnzhvz!#jds$|hdE-O8W9xLE$(m^c
zc(z(+ZOTF67*xYWZy36w)#rpC*BouY)vuPc&e{>F9YrN&o@H%zxotI?4DJ|MJGE*7
zgLKrMg*_v#yZ?hb4(j?xnCN!6apVmbpIZ*ZIa3N_ePtBj+n>O)LBXdS`-EVp%uuUU
z#;-Oii(wCA-YRRhZ~UXp7XlK!_OL`P#O71HVcjz`=I0X8fRRC`Y#w^}!D`6w1UXte
zeW<76LDl_=sX$8?kvl-r54Gi8Ur191qA|&UWc`6%kr@iPu4vX>71IIk$i;D&l-fVc7+p%uxkXs^R^K!VQRx)46*B-oQU9#-6Cwe1Ppj4o5}(J
zdrrh!f?6LPK+(32qZ)jtm-60U98Sc1VQx;)pR-@u(hl+)Mjg&9XioaB+{E#i$A|K!
z!MQ$em||J0+#p0?#iKDR1eQPxFs29WPFG8PtEHeTCL&GpUTBjG8rGlYVTe0Pe6zeI
zp|QUTV^4zG{RYv$q`8lY9cwVKX+?7fEes`EJTi7I^?>I%NRaLB>(T2v;GCkZ=kzgp
z^WP)6kx)M&V&b8CAN2!N?*fuZ(OL)N<K3yZ{7U*r_G?YM7jiDP6Z90deU>%;FA6O+KC>zs;^qQP{dCSqHbdBx~(FfRx
ze)D>V0pg(pF%`V)cgj~N#$y4k85=g&be$z(I2t=t>|GUmI%Hy5i8$TDD?7PmxcOe%
z;J{tYB_lfpfLE~=D8UL363ow+nZ?#37*+UKqoII2>CmC3x!s}I@Hl9&`HwjnD1c2-
zUaJJTC_wx5ZXkd5>CYRpY067Xswd^OmLju4o8yZ@M(>UmRYeaJMWwYd>l{pi8V?&;
zh;IiX32D11iYlen1JYVtn%b(LFDM$LXWS0wCpzP5afPYW_Jj31v3o%`jDg(#e67-{
z3`NgmI756?G?qO2ONhu>pKf%($4Y7t{@6SOr+kfc*!rW|{g=!(aYHbcP?jMDvb(0z
z-4QUU<5PSr%)+eg*RAN_`idt9E2yHSQK91J8&j+rN53jr<-;#8#?^;P^y=+}G?NgC
zGQ|=9h#w!fO^&a~6zyGc=?&Y}oUWm0E)?LM3QrM-cRS;ULUyBg?|)$Sc)c8Ly>yKw
zQ)+enKs3jc!1^pTV>?Th$YB~!21$T>IK#9Lp6bt5qy#{75
zqc*A2kQL<%ADNJZis3J%fYH^Ev$4q&6(rpk+2^56*|)vbe>PkH1d|^!Tu6YZXz3D#XIj+2X@}}96GbG3lMd|Q&891j_AiE7;U_CqI!&;i9DGvM<>!V4{=V~
zkWWKkmj>pzvw*qR%TkRjfZ|)2;eUG|TDkRWq{y}STRKwTY|d7lIjTD(X0g+)y>`HS
z{FSSFB2&Cc=_3hYt9ZX;5MUR2@D%cBsm7r^Z8l%dbXM>__J3->=I4qc68BOyX}>
z#!iOeQP(xJ1#cOgG-_{<9qA(C8}Hd7bIqn&d?IGvncNgw|l+L_$~MXiMU_?#gZj!=Yy|rRP3*jy0e^+pnsn)
znwo!sBpNMs)Ye1Fn$4wHSa1W*?tCXq2;>L(b%jdi+DeaePweSf$_TK_x3?b)uy-c;
z^hE#;KPNKd58A6ca$9Z}lPD%6wS_p|hT5!YYoUG*74`G7VZ;6jtkOLL2LMfp8sGHp
zeiu!aA<#X|mTer5IfPYWh`OZPRS?RM3|
zW(`cKEXRAs-7L{+w9UcmjWkwhtz2~f^H*0zbY`TXCv+58C@-WHr2M?
zsgB@5#n2Pg@o|qxxa}+by)RTl
z>G~JL5Pz#2z_fh*D?`aTqO2TS{^pKAY2z?hg94%aO^7F>4k=WXAxDmfT((E
zLG;bkg?{G8L(XNInXCQWHoFxi#xD#TZdt3;sm7#6Z`ce&-`WL!EcP|gU9b4|bQE3#
z-@;>FA)c+;x-9WWS`~^t(FZ{8uD$hVg>z4pb-4AKvbM93uElDe5p&jBPWR7&Jc;(e
z(C3s{gc-+7Lwsc5hv+rcQI-2vd1P<<1e*Oxo3B9T5}jRTrmJl8Jb)TC+{&2n!m|iz
z&*t8-&YXSY11$xM*_4_mJ;Fovr%kb
zaoRm9T(NMNY|!-CF+qzMukeP+$uMYI@$%A2`1EujE~m`Moe%=KejYi-yUU?9Yb^w1
zLhS+mcqAhDHiMhmHsO*MdC(aoe}BY)>T>amFTgW%keXp{tWm9yU9g+>ujPal_mxx5*+cXwT1!UN!4amAn$!a#j)`l|
z{C17!uuR#<(dJ&9lXtZgGP44u@US_n3fRkL?2wS-_SrjCuS;vKB>GAn^db6U7BqpR
zFj0n*q#Q@`)+(k$E)mVv9%v=qlxePIeBkLV0c=gt+fQ)*e2Szdi!5u&;TOZ|3gbRl
z$~*Ut-x6T=fYgQP%Bv&2Fm;bdWujNLYxgHg-LU~F@A>=Kyn>`4yA%(&9U+k8|$1
zHMtkN)n=TvK5R%h4AaxfTB46dBu9u_k8hy`DWbWVisjvJY~DyyvXrMoCtO!<{w{T@
z`hFcVvr?EA*}|1PM$C(jnc~~@KEGt!)*k`vEWNtoNzs7B+sYWcnag-?{amZNvCUyc
zqpv7xQH@uYgt19g&t0nbofwY!gQR*Mdu>$KYog-9S8^m6$i0b!s4{zI$sLUxlUi7#Rvp6cc-?(D>
zn14<1ZdnE0sm!P;$w46Pp$hW`^JGKvtAfQ?0k3oPmV(!seV)+0>ge2<%Xw
zb5YM}+$hs12({j7JEDzyorQ(Vm0|=!I?xWCR~Qo&m9b@at96KTsW*
ztg;QX(OHfEnbZVTdp0^gAhwhBzz0EHi%LJ;BMj9ddDQN7BzNJ9D&SMyZ4Qm`ZwAZ`
z?+2Y%0E8>3XG;5p^097!*q(FUO#q;m%nG@FVj9*FC`FWMETCM6p11(2P9VJRTFUkF
zHQx(*yK9TvyL>73oQdjw(C32X(zxF~P9B8o+63y`hN>ZVzY$IAjTnJwhVMM#vsXTMDt2
zd#*QgePU!}fvS1#)b|(20GP7dZ%@91r7r;w;qRL9fQ%E4MucSK`GcJLzj#v9N)7J`ax1^B>8(?c
z%r^uhn^lCbxncL6)cjvSO8qZqx&9x&P~R4qh{=b*1Od=bme77}+%3Gpk3~}U13L%3
zdi4LzlN)*tlLCzPcQ)($It9JV5#4iJ`KSP(uMFbwE{M((=aDm+IDKkdU50uez;*XGQ-qH
zPLADzJT~OvS;0|)v^#x&k)37oJqP5bs)9l&fNF!;R0|_y1liS5UPy9iC$04wNUrUS
zvusEFYia4StGG4DKomUlwxirE1!o#z7#IW=*vtth2PAxxed+Qm3qCJp;5$*+ZGl+I
zOA)}Qm6xF2aXo&L4(&VXS2GlgxF1Wzyk^&5gO3S6cyAn#xqK(3mnwK~vI3x6ceq-J
zMEpEJNtqO2E0m;sJE#8zv+Ga}3Q1g)pu`T{kJ7dORV^`peit%r4v!EdhUx6kklb4l
z%-rH8r+-f1*O`wVNj7x|+9d2{c;AL<=Q;-eJ>Dd64TX>^ZCxMn@Y|rre!?cumAl
zze$$`xQ2cdK>Ve3D}=wJKV%PdA|oid>ll2={$}157VOIaNKV%kHAtE_x6nmwIB9J8
z_{!IZUKR#%>*J0Zqff)U~Q-+Y!PB6;OHmg)S9&ZXz|Mo7BpC3O2ne
zLS<+qKMG?k0;<4!Tt|5>w;Li@$OTWDAb2UyM79WsLkLbWPJPdsdhT#Zz7~7kq`fTZ
zh=+My68qA(6+O0mx~&_@Bn9TVH+5UD)B)a(4#x7p#vb074!bThcC!-whv|R$%dG#j=d;NB*EL#^Lx2^hc!cZnCP-;3v2HZ<10J
zyH|STk>PnmnQX?BT_@i~7Nqx;RTTTd>O}hJAKO{=|5~XbaD26|z5NPb)>)
ztj}w2QlNsReLY4jQ>VGPfE@T2X>-rHYc0!ZS`tGow}!PXR?MX?;pV-NdM#@tb6qaE
z2+cVOS67PsWLgxP6qG9gCY-xmZ=UZsg_bGPB*04#Qn>h=$EB1AZ6BfxX0}?NH5I?V
zZ2xC;;ZySQ-)QTVp1sUYm%w58jF|1;iV#08DR=!k*~&-Nxmd`)yeS}TY3>7}7jSfa
z-P*tB_-;p{fcRzX4uY;=p|v1N!FnU9fd&VBi6E%RnE3lXSBo9avy8nj&d2B&eayOZ
z5sLEACkQUVIVbWtcU8syxWF@&QE-I6KaSto!(#Ka`2goUd+KNr@7aNW99IOqp=o{5
zZzZj~tT!DzU5%_RMD=t4FLKZ-!%XiZX)d-*M{A3Gaivn@$;*7%$8YQoeYE(_+k{M1
z%XlX&zv(eL>x;=no6&-L9M5)8G1%~kZ?7Nagj61aM|Pe<~fY0qtSPUl9l%^r(R
zP`&onKVK+N>Ar?Kk??V^yiYHE^G`7aw_bqUp_$M#L?3&)+gq?F-Ie#LYyg_{I?5qSYJ=J&Xa0a7>^!JQV)=0d5GkQ4?r5n
z-BO@KMu8D@f&t_wbswNhEyePXQK@L*A@iO2FUNk)yyq`63PTdkyD?)2)X-T=QrF-t
zEXk)V&tWJm`RxmOT)%Q|XgRfjGKL(o6!~nVr9QYvO!n<-1j=%ygP-<0^;to7Wjta0
z1a3Ehw&c6PY{tP`bbpQ{=n=Cv0rpETZl_1Vw+sk->`Te_o|UQHfiOOQU-ER;V-0HlP(kcEIdvf8IVJ(D>&8H6s>H@#o<
zYVIQGj}I*S0Fwsp!;3P1zPx0B)``Y)O&C*}BQ
zKR-s>6tMr~u+~4fh^MuVp{Iy*EJ=LaxiR>Rz}G8xQ#a$(&IXZRtKGruoR)~;^z`7P
zM>~+LJv|l6t1S%n;(Hu_M<6qKj&4{ow_VvON3*IKBYk*=whH|EsXj
z3Dx;^`MBa5Koe;fP4uf4d-nn&Yr$KmKmG>3SG!FFAclYDllJMm`aRX*nlZ!Z>I#du
zjzro>(K=NG_7LylOQTqIVKMFJOyP
z?Z3`fr0-C*&FrmufUukbE)eZeH`yCr$+7lFp5WY#iQvv_ti}KHcK>{cSl?)++!^G0
zTGc>HkoVRJ!r{yHcJ_n?kLe--@h73$PGYHfuP7cGOY
z+DyG4LW!0X{b?_)5pUc%*&R9cX+1%t6M-B&*OxUcLyVW{-8Wp)6j}H>7*l-ndV~(W
zp4a9*VB#*VQ-LNM%WTDW${8THOB6H~O49NBPZ>p?D43&f&9F2`kP-)H)nKlIZM0
zWM~uD>ui}XU=$ZY<_xLQX=$(ti=ig@p>pur1nOBTVf;PmA|{==In4fi%Z;MZhNxN?
zm@y@edP5r#E5ZvI{GaAHJk)uq2pi+Cq)fE`;L-gCtHYOngdzoRYZU8&2fI;f~0n@G22`Y0DSliQe
z;-%mx?A?=SA%15TxUX9e?^^2HJk+(#_|r05yR3u9{l!=T&|T9bf)jig84Cd03AYWb
zb{+6Cfv>SGjgZ&TDP*$oi{U(rVvC&G_N@W!x!dVx198VD{WKiieJI{(2ZjLeA-5#S
zWyV+bX=^HedgIeK88h&_?mB@@X!KIve9^4*^RBhe^1?DX;`x(-D`)>A{yLm
z1_DP;q!(sZ)OfVO-Idx_HM-`6dr)0WD@~iCpyM4-!H>p$aK#?JxnTw~5tcFrnd!S>
zi)~@Vf6M99SyT)U8z*d0j2@TtEWmc0p~-
ztyqllU*2Ly81lO-W|W1s`ub{g5Nwu+ZdE0)NvM~ldaBJ5}*mzF5dWiL`1?{^`2M=F6l;cIBr*fKoF1MII*Z|)+R(&
z&-+%=q+{F8>)2U7eqAEBor)##;-lpA^QquGb+2^g
z=?{^_^|mosUj>Z&*vj-)37fz~ks9S9-~Y0N4alQz?P6>@@n;>($XtisewK$OrCE(0
z3~!>gMyXMoCRDG5^1znWi_Y%AUDTSv(D>yRX
z5#!3fXm12nnIC5q~KnE7RA
z3GsHp2AUZ5(P^uEmS59*VAYJNrOy4bEl5zFhO~LUu=S+vldSeImuY2fI
z0>@ae*~k@y%R^0uXD|_*NMd*|ve%5=U##n-x}?WnPgW!m*{f}uE9O_m8RgIyv{X;@
z{@5w-O1}G>0vc4C2FrurK_n)=pidm+N6T4~VTGC1D44eHIP`3A@whH5XGzC`=-{3g
z{4snC(m8Qi=dnk#QDwD9Q;65XokW={c@1eR@iL8eW)!2fvVY6btz&@v1AUGdiB|Vv
zl|JAmj)E~!*BgZ%r3yw=URtdj^~`Z%_~21nx*FT;qcFY|Z`$zt6hF=Cs2UudORuYm
zBzmu5ee!|66EZ3vEW;2)GG-cBsF2VxW1pqk>1AW`hs{RnIehbat80@^uonnaa`)Eb
zcb##gCi?}fB@+o>L95uowAKiF86keLf4zfUaT4{W-xxM&Qz}Rft;b}-J7^_FAV~8-LSFHL4CSBW
z{DcyC!0zDF(2@xoul!$v<=r(xTn1jo6KH{j%qUl8Cin_`xNO9rYnC-C(KCzW<_v|<
z6XNqlkB(pJRu@99Jr$pVDVRrcF*jtI-}KC41s`X=jWd^`-v4419rL!JENG42ld!Wa
zf5bQLrVP6Ek>;|h;+4iPCLxWcj}$tC9+6*^#RYrdHlHzgFLh~cA+dha6`4QCZvKer
zg?y_I@$%!cA4iM%+uEJWWeft^b&_RezV$-8@K8z!)H1LT=0<7Fuho6>G1Rj4P~+~O
zHBD2&p?%XOD{PP9gb9Gpnsi{Vd~@cQe9MR`GkT)S7ztfFL8*cd{hr`M{yfciLA9{A
zS~Mlcj=&D)5nUajdVBO%+N#QG>f@Ge{iZz1D5ex6u<43{AQ0#S0~e^6Px&$`-LqBn
zqcrnbL}5o{;dK6kNZ56x-$G8m(Xdgdfr+!m6*Iy=N$WOG8S4Zkh6RDAv1MXkuh&{s
znWaM;+LQOZuEwEXyIyu7GGNF
zJ#!b(Dp;J;F!SKT?K5WwlGa1$y9VMQe@#GQqLX8R7>y6(zjC}b59Ghc5jw~aB1U7u
zsk_(_zT$FRSykxizZkH}3$Zc#$Cz>4r|w!?P*(92r;Qf(X$FC!YT%j3s(wgb$sPsD*uWL%ORL#oh8y3G+b@1*>Kd=IsXA
zZK>Xc!Nm5;o@;3!9!G8L~g5%41?woxmz;xFMLca~9A$a2?+p
zT&IeXG@!;MHTM~s;>XfmYEwKh9^MLcUUOt;U(*drSU%@d194`r!cLcV(VP5xROjKq
z4wAtV>Cc(;sF{j&+1QoNiJPNP62>|Few^S$eKG;Z*=yh|@lb?kS%
zByy+c%h=Y9REv$R~7)R3iIp}KoaXW3C%(i1!^Zp4atyYxY
zVg5V)vx*Vq=T?Q9KapbS>`TGPZ1K+&MAm`>oYI1o5N
z{NkRc)R9IErh_>p^~z=*y_-d;uEpCZDPG^3EUr7ga8>3=UGaml$y?TEHM-wZaD}>F;C(bLxxy9ML;-AA&lc-CiZXsB
z%HNm);_-E^MyvfFM3+9%X56|*lOqyYhJJ9QO
z+*oTySneqBXcpY9N4JUsEOr+&A3@igQs%@10`tgiY9VPS2oep&57VbKW137(zde_G
zY~zDMS4pzLXl0?4eDuc)%8Da1h#^}MvG{T1NS$h2)QF90_Ohv7p|(NF-bn;Alm49k
zg1!aFZxVNC{t?1%zDS^4O{jP(>ae90!`~^+wplTJQnrfqt#{2C*2@l2Bz$Nf4(r#?
zaex+XXR@bN9t-HcgtUzH{!TB@x)j#-70>}~T%qVu)F@Tdpf3f@+zIHjMr2~1IgsJw
zqgUSKGhjx!>_4MKN@-9OPxW4QVwWzk@&6SHG;>vhpc2hg&A!6h&mI9?$l&&@jPp;J
zMzxGAl6(8IJ>>MllI~Lu`_#|7tbh~bM5JwNRJtQwL%gYJo$jH!XWK~*6xc!D94C1W
zA;+k%JqM__^KYY;;=KkrPPyFU1+*nW>z2Tmus~oG0<*Sl_XmrLf#$UR#a>5@EnJl&
zE}djNj{;EHrrkLrfJq2c_J9U215V-q&^wQ^J*<1-EBBMPNf!Qjm!JKiMaBku@rG+I
zO80E$t5CM$2mY2-IRD$Q0uyE;6KsMLf4*4JK@M5nJ^3ysI3Ju(Evxpqzjpfm2DckU
z3s8V{a8H<2e2>}%7WYsL-KSmGi-xN1zZSieKRp~@!T4k@wy-F^JSA#5P+(j+J;(%f
zSKfep%)HL&l?A8YC_TW3$Z#$y=u?6R1Z5ZxnWKoI^p{cgmy>T07HkeHIbJJEb0NB}
zoFW`u#A5D!x>ZDfP=3F3>;k==Y`T=x0kq)CXvU|D7Odz^@M&P+|G@|=bEy2cMYFHp
zrDW2#Y1+Jb5-AeZhHONuCh(u)+!5|O5$A{}J~qz37|Du^N=AfaQSmw;5M0a1_|q=g<32qlCT0wf{i
z9W%K0-sdUTdC&RsUhnhl`9&~fW@gMWN4f9+|97vKJ7d7j`Hg`6?vnJ?-#;EKK+BGR
zIV>xcWS(YePgS+REYvg*O**RiJfqPhV9~&|%CyRL5Gxb6K1e9$R4kDLi*ai>N;e8)
zd&2RACT{MNMp~I4wk{fT{r2J~iHy^vn5S2|pNxW)MBmQeGnv>7S8v_bbTuVnxU<2o
zdWQhq*0r+XM8j=z^F=}@E_#>6n%HV#z|8HQA!fL%4!@ZdwWwb}Bm(l{EdcRB`74_~A5ZK*F=zv)gyE&$rrY~*
zMvgIPZ0jQdM@90cm!zq>!v+c2@v<-{)(Y;<;^6l}Uq%=LQa__?@)v9IO~s7+i-N((
z&6T&O=($*XkU1}=gM>gP}9q7W~izqwb}X7f3)m_tq*tX
zhFu*hmU!n}k)+nkxJ$4Jzgdp_v4Yy=z`SOe+QE2x@z$X_R(l#u*^K|9o~uOxMxW?)
zrccZsgVJPTnl7w9uX5t!`#lG2*?J}+`UKxbF6u&`8L4)Fc1mgcrD&?!HW?l$1?AlM
zyJ2E!rS#8EI>5g$;%s+OfinLg+<=LxugzeDZ56H5)Y=});+>AuvqYsmU}hTP^9~KV
zHzn&ESilk~P&uI5teA4EhyKRbRMl&`K^_5b_1N
z1eVKN!xD^?5bj1u(C?C(!Hz4a=}AU15(|ugskI*wOO)aD8;zUx84921^~<(3)8;5&
zdg-#2z(yda6Lr=TLYEMtRTcKqp{>OYYTFb&Rn^qma=XuG+5pc2e06l8c18C8#H9}yVEz{HR08^f|`I|
z@OvHi;j(hbIrOyEf45`*p2~vBS_5463d)fRb_pQ+mhi+Ny|lP37;@=M(i{A8xW|gu
zTbdwm8pE&}wwLoh1Vkas{>zzOkPXy%Xc~Q?i7t&Ca#h$mwO}ErbBOk%s)~KW?R#Z&
zwd+ZY=9602u{&wUb4-Ul9edB}2+m#z7ZOU|{Q=MWtgOZ}5fjjNe;abJ=$k2%fQ?BVnOKv4G2#6Q72C}CwDybflV$*mySsTRD8rXh}
z2&5Ubl*OnAQcfKg)_e&$J#BaA%u%i=j*c90+oSWJL<5Vv_Nz>#=>jro3gIC8vO7pSTsRmw
zL{VxAW!noaE|IiF|sBPcU{Nwt>#cy*;j1TP$VcBXeV4Xg>DrB9Iu^|Rf=#+;6x^%v;>a+P_PP<1Hw@=a;YF5T!pzB>~~o{#WC(RZds
z{TR??DAQOAfgsd6b>+hJ8$thr*D2id>t9nQE~VDw>^qhq@E?R-qE`59l4I7)q
z(6M^q26&!dUJ|yJX%^r|WY?KRSuF1PbVzjmWnt3%NXZ-P}y
z-A|iV1$rasLsf8B(C4PlH_+3ime*fp_&ri^*Sq>K4@ePC$}K8if-F8vO<@+#5^y*>DLU_xQwPXk8QLx8|vLVfslo
z4RI`9p5qa#CzGNnwMoKfoanPUuv7ptvK6D400&tUNd2jN0{Pr_=q>|2&4WAKhfwtTN?ypd
z&Cnz?S5K$8cLkh`W#O9mu@(vv@XM3&w3Tz$K+f2%QKgh{(uro8q`N)|JrLSx;jR;A
zq)?QkW{8kb)cBloC}xRPBn6Fm0!tON*P*Sd^Ym2BBWi51$^j&rRNC}6RsHT`E5L}&
z1>FnF)6g|AoV{zLD(zD&dL?RU%oboI%3^!VvQoe@6n990m@mquy4yv+1ah9f#Q}?x
z&@&<)r^+SrhLB2|&3Pkq!#_o^8Hh;ihYcH2WU4ReR<0onHYa)t?`%E+Y7ze})j%WF
z{pYntuIQzFr!JN0k#`pK0)4KH&hbJyWhs=_ND}akjfcTJ!S1%HdM;ggr_u!YX1WNd
zAHNqLAM=`i)hxVxXY?UDVBJ5V01rbY1$JD(Q{D2gHojei!Ie2F9hH86Is98Qq-MW+
zv?4B%Vd!ao7w#d?h47b&u9isgIhkdq#|w6IAO5l!wi&-)HetX8f*2QoLX_-&^zNV2MWVQMF2!E`kprR)UUFrw^g<5pu0AS6`K1fV4{Lm&+r
z<~n$=oLjqVv$PmO%WZe5=qEq|&`=Qn{z#;nd5*g~1Mmxn+ABG%&&?PguyZGj*|_ne
z&*t?#fkG)w_7>C;sqw;(_NsV?vM#x1
z#ly8uA|Lxu)&O+w2SK)%c4Ky4zc^`Ua|EMH^hw7V>rSd*KIcw%
zDOFdx6v@!XY!mV(xJDN1Gif%xeu4X4-+c6UM9-Z9BW2kS4zODdW*OqvzSVP{+pS|x
zsXR39$?O}$gdqPARkkSWYTyH{_KRY9##3T{E9l(5Vhun}6P`4z${>}QGdmm<@LP3W
zcm?zQZYsI2=$jnRw-=oSSexM!7t?1VrpW!*LTqFeoxhz9XNPw0933zen?9}<64y7&
zwiMt8DII{9*Yo=WEn3d9OJ^Pxh=;`VgdPvaOJUTlnM+A^uv}QToM>LbP3J0Bz7))r
zFdwHyZ+qj%isrlP2)GN!Z0dj#S}w&-(e;8*^=3C@(&ybVaul&{Bu3%g$_U2FZ
zSwBlN||-If=wLy_TjT6^fz8j?DEq>k5MJA7T>Fu5yjbvJ
zzxQUJy9CcSv>Xph#(BjaH^_NPkSnfA=1bf(27-q&Eky%Kf|V`}mi8nflbno`cb@Q`
zdn1KA;b=Zesw*!hQ2g5kplq%X;J7?s2SL3OZ78mG_DTk&+#|$^Y#QoAU@~NOd@cGc
zMQ^_itqg%xAPsOu{oE!>;?NO_z3Ib{kcBHtp@wiY&D`m<1kfBm(cxNNsA_)
zY>id^KX@|EPJ@7BVdUHg|N47<<{c#SMC?s!qCp_r4|J~#Di}}(KSUaA;wsXlaOnss
z+zyyGTw(I4hn5l>;d?#PA5j)n0c{F(75mKxUsL2b#bY=^v!z%?%a`Z
zxrYXTRBR6O1J(z`T4Dd>)*TK0do5;mCF>S+l0}Et;0=D)kO14w?4ktkyFoJetn|3d
zr0d)&mll=2k#F&iLQP?1A5t4&ANOgu`q
zubw#U+G(~`e)C9R!qN4U|D%no(FN#xgJ7|fgSnGEKwdQBO-b{dFlzBGczb>*@twyQ
z1#k#+B2zquVTh@_Rau;O2c9-FBAUzk_$d&Z^5aF1d}>_aR9?CWlu;L47pn_)rTeQ?
zZye}I;hu?n;*KSaU+q}S6jcE$g}m}h*!vyB6&XrjIONA}x9G5))pKza_DU8hmjZaE
zcTS#zB6(XryL%k7VZ&ybu?86Q2zBCe5
z1Lm7>gN}j%zUf%0!{13QR}DYE6VnT|Nn+9KYw-S0j~r+2PSqzZKyEip}=zvZQCm9>-IV!O)m>VU&pEe0Zj*T}H20j&Y@~cSWnj=5iYs6^6}|oiryE(@-KY$%4*d
zrYkRhS4boURh+R3B{v_LIj*Vd{=8d7CyRCnh2xcfRlyCMM~-`#o({A6*(jbQ=t^
zTCx%P5an&N7~`6B?h~xh}2L&`G_;omrEmSD-o8?9O;!^HGHn#j8qF`
zauzcrQrfF;DQ2ub7458Tz{gE$`cxFpEs0(k=CeUO+Zntu;5!aSR_}C2fuTs)el8&E
zO`f?G+;?=86-q-Tof^2Za#h>+0>he=YG3TFV@(rJ!THuK0fiAz7Kt=baX#c!dEpwR
z#dI(SK#@6l78iB`_QqhF$B)xo-Zyy-UwU)xXJRmR65p{bugkazsShkSy1X-|EgB-!
zA2QFeB@Z~}On00(qBZG|7n1xzTc67a_x`2CxuzzTS#HyUj+r5sNU}sz8yQRlD(yaO
z*>cLNIx$Y~dh2ebyG66e@Z2mM)gv3=I#bHiHZO~1Si8TkuXnRcMk`_+%OqM&W8dbr
z<;53xlqa8_J4bYFm>W&Klg@(nF+%yP*r;h9=?`)my&7OIW|Qm{VNDyLWdp6>&O}1{
zt@8$p@@&C?;fS}r6ubWtp(2$t$i6odG{dliD8?SKRJ@Dhu7e{HXw619Qp%++_#^;O
z-mE?*%=>-$X*Dv=u`rKwNLx0#KG*@o)8*FP9~}9S2zwSp{OJ
zv^*Q10FWPK@vq4Am2!hNU^^nplsL=%wm>RP&|GP|Q;K+phbu4oYmB<>NGn{x
zzH@ZlM2j}y14mkHrD2D2Yb?Z}&i>PTgK?=phNI%@FZLzLFORgrB^chDfuG(RB8{V;
zY7)!Y<^Sing`wgxDzuc+v0t{^%KZAmwewk-_u_B&;|w+}hKlkkO-@zt!-59R>isgu
zHp3D%M}4X)em!}+&0vSEAM4!A(Og}ikzT?TxD4jK$B^IR!40<0QFb<$3GYP}k7x}T
zN!+b~hyOI;FgY;rJ_(xKObcUfcUyBRB3?%#YN}6CF0=)gw`GU~S`Q$AUaw>wa)OA*
zl^3LMVl>?SjFlg&fK@@5edCY(H0+=*8rX4!+5i@9C=zmth<|X?Pv+F+0xR#SR8lg3
z`7b@C{VudDp7F0ssbJUrAV5jX{D7Ojw!lIrV-tQ90eB(M@&8w+#Q%8lgJbShQ!?lK
zFC9bj>Rt&r$C%cloW?!d9}
zlBMaKAOO4Q#0qYciUU;?fV)0C0|ybD0WwX}Tay