From 81ab29c8b7e5fd1a83f734cf5908d5748bd8129c Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Fri, 21 Nov 2025 07:33:26 +0000 Subject: [PATCH] Update latest GitHub pages to v1.2.0rc3 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 6157 ++--- latest/_cpp_gen/runtime.html | 19741 ++++++++-------- .../attention.py | 290 +- .../model_engine.py | 385 +- latest/_modules/index.html | 27 +- latest/_modules/tensorrt_llm/builder.html | 27 +- .../tensorrt_llm/disaggregated_params.html | 27 +- .../tensorrt_llm/executor/request.html | 27 +- .../tensorrt_llm/executor/result.html | 64 +- .../_modules/tensorrt_llm/executor/utils.html | 27 +- latest/_modules/tensorrt_llm/functional.html | 27 +- .../tensorrt_llm/layers/activation.html | 27 +- .../tensorrt_llm/layers/attention.html | 27 +- latest/_modules/tensorrt_llm/layers/cast.html | 27 +- latest/_modules/tensorrt_llm/layers/conv.html | 27 +- .../tensorrt_llm/layers/embedding.html | 27 +- .../_modules/tensorrt_llm/layers/linear.html | 27 +- latest/_modules/tensorrt_llm/layers/mlp.html | 27 +- .../tensorrt_llm/layers/normalization.html | 27 +- .../_modules/tensorrt_llm/layers/pooling.html | 27 +- .../tensorrt_llm/llmapi/build_cache.html | 27 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 27 +- .../tensorrt_llm/llmapi/llm_args.html | 307 +- .../tensorrt_llm/llmapi/mm_encoder.html | 27 +- .../tensorrt_llm/llmapi/mpi_session.html | 27 +- .../tensorrt_llm/models/baichuan/model.html | 27 +- .../tensorrt_llm/models/bert/model.html | 27 +- .../tensorrt_llm/models/bloom/model.html | 27 +- .../tensorrt_llm/models/chatglm/config.html | 27 +- .../tensorrt_llm/models/chatglm/model.html | 27 +- .../tensorrt_llm/models/clip/model.html | 27 +- .../tensorrt_llm/models/cogvlm/config.html | 27 +- .../tensorrt_llm/models/cogvlm/model.html | 27 +- .../tensorrt_llm/models/commandr/model.html | 27 +- .../tensorrt_llm/models/dbrx/config.html | 27 +- .../tensorrt_llm/models/dbrx/model.html | 27 +- 
.../models/deepseek_v1/model.html | 27 +- .../models/deepseek_v2/model.html | 27 +- .../tensorrt_llm/models/dit/model.html | 27 +- .../tensorrt_llm/models/eagle/model.html | 27 +- .../tensorrt_llm/models/enc_dec/model.html | 27 +- .../tensorrt_llm/models/falcon/config.html | 27 +- .../tensorrt_llm/models/falcon/model.html | 27 +- .../tensorrt_llm/models/gemma/config.html | 27 +- .../tensorrt_llm/models/gemma/model.html | 27 +- .../tensorrt_llm/models/gpt/config.html | 27 +- .../tensorrt_llm/models/gpt/model.html | 27 +- .../tensorrt_llm/models/gptj/config.html | 27 +- .../tensorrt_llm/models/gptj/model.html | 27 +- .../tensorrt_llm/models/gptneox/model.html | 27 +- .../tensorrt_llm/models/llama/config.html | 27 +- .../tensorrt_llm/models/llama/model.html | 27 +- .../tensorrt_llm/models/mamba/model.html | 27 +- .../tensorrt_llm/models/medusa/config.html | 27 +- .../tensorrt_llm/models/medusa/model.html | 27 +- .../tensorrt_llm/models/mllama/model.html | 27 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 27 +- .../tensorrt_llm/models/modeling_utils.html | 27 +- .../tensorrt_llm/models/mpt/model.html | 27 +- .../models/multimodal_encoders/config.html | 27 +- .../models/multimodal_encoders/model.html | 27 +- .../tensorrt_llm/models/opt/model.html | 27 +- .../tensorrt_llm/models/phi/model.html | 27 +- .../tensorrt_llm/models/phi3/model.html | 27 +- .../models/recurrentgemma/model.html | 27 +- .../tensorrt_llm/models/redrafter/model.html | 27 +- .../_modules/tensorrt_llm/plugin/plugin.html | 27 +- .../tensorrt_llm/quantization/mode.html | 27 +- .../quantization/quantize_by_modelopt.html | 27 +- .../runtime/enc_dec_model_runner.html | 27 +- .../tensorrt_llm/runtime/generation.html | 27 +- .../runtime/kv_cache_manager.html | 27 +- .../tensorrt_llm/runtime/model_runner.html | 27 +- .../runtime/model_runner_cpp.html | 27 +- .../runtime/multimodal_model_runner.html | 27 +- .../tensorrt_llm/runtime/session.html | 27 +- .../tensorrt_llm/sampling_params.html | 27 +- 
latest/_sources/_cpp_gen/executor.rst.txt | 42 +- latest/_sources/_cpp_gen/runtime.rst.txt | 308 +- .../run-benchmark-with-trtllm-serve.md.txt | 2 +- ...ent-guide-for-deepseek-r1-on-trtllm.md.txt | 2 +- ...loyment-guide-for-gpt-oss-on-trtllm.md.txt | 2 +- ...nt-guide-for-llama3.3-70b-on-trtllm.md.txt | 2 +- ...nt-guide-for-llama4-scout-on-trtllm.md.txt | 2 +- .../developer-guide/api-change.md.txt | 40 +- .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 2 +- .../llm_inference_async_streaming.rst.txt | 2 +- .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_kv_cache_connector.rst.txt | 2 +- .../examples/llm_kv_cache_offloading.rst.txt | 2 +- .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_mgmn_llm_distributed.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 2 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- latest/_sources/examples/llm_runtime.rst.txt | 2 +- latest/_sources/examples/llm_sampling.rst.txt | 2 +- .../examples/llm_sparse_attention.rst.txt | 4 +- .../examples/llm_speculative_decoding.rst.txt | 2 +- .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 2 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 2 +- ...enai_completion_client_json_schema.rst.txt | 2 +- .../_sources/features/multi-modality.md.txt | 2 +- latest/_sources/installation/linux.md.txt | 12 + latest/_sources/llm-api/reference.rst.txt | 4 +- latest/_sources/quick-start-guide.md.txt | 2 +- 
latest/_static/styles/nvidia-sphinx-theme.css | 2 +- .../styles/nvidia-sphinx-theme.css.map | 2 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 27 +- latest/blogs/Falcon180B-H200.html | 27 +- latest/blogs/H100vsA100.html | 27 +- latest/blogs/H200launch.html | 27 +- latest/blogs/XQA-kernel.html | 27 +- latest/blogs/quantization-in-TRT-LLM.html | 27 +- .../blog10_ADP_Balance_Strategy.html | 27 +- .../tech_blog/blog11_GPT_OSS_Eagle3.html | 27 +- ...ded_Decoding_and_Speculative_Decoding.html | 27 +- ...ompute_Implementation_in_TensorRT-LLM.html | 27 +- ...ert_Parallelism_in_TensorRT-LLM_part3.html | 27 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 27 +- ...1_MTP_Implementation_and_Optimization.html | 27 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 27 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 27 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 27 +- .../blog6_Llama4_maverick_eagle_guide.html | 27 +- ...formance_Analysis_And_Auto_Enablement.html | 27 +- ...ert_Parallelism_in_TensorRT-LLM_part2.html | 27 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 27 +- latest/commands/trtllm-bench.html | 27 +- latest/commands/trtllm-build.html | 27 +- latest/commands/trtllm-eval.html | 27 +- latest/commands/trtllm-serve/index.html | 27 +- .../run-benchmark-with-trtllm-serve.html | 29 +- .../commands/trtllm-serve/trtllm-serve.html | 29 +- ...yment-guide-for-deepseek-r1-on-trtllm.html | 29 +- ...eployment-guide-for-gpt-oss-on-trtllm.html | 29 +- ...ment-guide-for-llama3.3-70b-on-trtllm.html | 29 +- ...ment-guide-for-llama4-scout-on-trtllm.html | 29 +- ...oyment-guide-for-qwen3-next-on-trtllm.html | 27 +- latest/deployment-guide/index.html | 27 +- latest/developer-guide/api-change.html | 80 +- latest/developer-guide/ci-overview.html | 27 +- latest/developer-guide/dev-containers.html | 27 +- latest/developer-guide/kv-transfer.html | 27 +- latest/developer-guide/overview.html | 27 +- latest/developer-guide/perf-analysis.html | 27 +- 
latest/developer-guide/perf-benchmarking.html | 27 +- latest/developer-guide/perf-overview.html | 31 +- latest/examples/curl_chat_client.html | 29 +- .../curl_chat_client_for_multimodal.html | 29 +- latest/examples/curl_completion_client.html | 29 +- latest/examples/customization.html | 27 +- .../deepseek_r1_reasoning_parser.html | 29 +- latest/examples/dynamo_k8s_example.html | 27 +- latest/examples/genai_perf_client.html | 29 +- .../genai_perf_client_for_multimodal.html | 29 +- latest/examples/index.html | 27 +- latest/examples/kvcacheconfig.html | 27 +- latest/examples/kvcacheretentionconfig.html | 27 +- latest/examples/llm_api_examples.html | 27 +- latest/examples/llm_guided_decoding.html | 29 +- latest/examples/llm_inference.html | 29 +- latest/examples/llm_inference_async.html | 29 +- .../llm_inference_async_streaming.html | 29 +- .../examples/llm_inference_distributed.html | 29 +- latest/examples/llm_kv_cache_connector.html | 29 +- latest/examples/llm_kv_cache_offloading.html | 29 +- latest/examples/llm_logits_processor.html | 29 +- latest/examples/llm_mgmn_llm_distributed.html | 29 +- latest/examples/llm_mgmn_trtllm_bench.html | 29 +- latest/examples/llm_mgmn_trtllm_serve.html | 29 +- latest/examples/llm_multilora.html | 29 +- latest/examples/llm_runtime.html | 29 +- latest/examples/llm_sampling.html | 29 +- latest/examples/llm_sparse_attention.html | 416 +- latest/examples/llm_speculative_decoding.html | 29 +- latest/examples/openai_chat_client.html | 29 +- .../openai_chat_client_for_multimodal.html | 29 +- latest/examples/openai_completion_client.html | 29 +- .../openai_completion_client_for_lora.html | 29 +- .../openai_completion_client_json_schema.html | 29 +- latest/examples/trtllm_serve_examples.html | 27 +- latest/features/additional-outputs.html | 27 +- latest/features/attention.html | 29 +- .../benchmarking_with_trtllm_bench.html | 27 +- .../auto_deploy/advanced/example_run.html | 27 +- .../advanced/expert_configurations.html | 27 +- 
.../auto_deploy/advanced/logging.html | 27 +- .../auto_deploy/advanced/workflow.html | 27 +- latest/features/auto_deploy/auto-deploy.html | 27 +- .../features/auto_deploy/support_matrix.html | 27 +- latest/features/checkpoint-loading.html | 27 +- latest/features/disagg-serving.html | 27 +- .../features/feature-combination-matrix.html | 27 +- latest/features/kvcache.html | 27 +- latest/features/long-sequence.html | 27 +- latest/features/lora.html | 27 +- latest/features/multi-modality.html | 33 +- latest/features/overlap-scheduler.html | 27 +- .../paged-attention-ifb-scheduler.html | 31 +- latest/features/parallel-strategy.html | 27 +- latest/features/quantization.html | 27 +- latest/features/ray-orchestrator.html | 27 +- latest/features/sampling.html | 37 +- latest/features/speculative-decoding.html | 27 +- ...orch_compile_and_piecewise_cuda_graph.html | 27 +- latest/genindex.html | 147 +- latest/index.html | 27 +- .../installation/build-from-source-linux.html | 29 +- latest/installation/containers.html | 29 +- latest/installation/index.html | 27 +- latest/installation/linux.html | 36 +- .../advanced/disaggregated-service.html | 27 +- latest/legacy/advanced/executor.html | 37 +- .../legacy/advanced/expert-parallelism.html | 27 +- latest/legacy/advanced/gpt-attention.html | 31 +- latest/legacy/advanced/gpt-runtime.html | 27 +- latest/legacy/advanced/graph-rewriting.html | 27 +- .../legacy/advanced/kv-cache-management.html | 27 +- latest/legacy/advanced/kv-cache-reuse.html | 27 +- latest/legacy/advanced/lora.html | 27 +- .../advanced/lowprecision-pcie-allreduce.html | 27 +- .../open-sourced-cutlass-kernels.html | 27 +- .../legacy/advanced/speculative-decoding.html | 27 +- latest/legacy/advanced/weight-streaming.html | 27 +- latest/legacy/architecture/add-model.html | 27 +- latest/legacy/architecture/checkpoint.html | 27 +- latest/legacy/architecture/core-concepts.html | 37 +- .../architecture/model-weights-loader.html | 27 +- latest/legacy/architecture/workflow.html 
| 27 +- .../build-image-to-dockerhub.html | 27 +- latest/legacy/dev-on-cloud/dev-on-runpod.html | 27 +- latest/legacy/key-features.html | 27 +- latest/legacy/performance/perf-analysis.html | 27 +- .../legacy/performance/perf-benchmarking.html | 27 +- .../benchmarking-default-performance.html | 27 +- .../deciding-model-sharding-strategy.html | 27 +- .../fp8-quantization.html | 27 +- .../performance-tuning-guide/index.html | 27 +- .../introduction.html | 27 +- ...ing-max-batch-size-and-max-num-tokens.html | 27 +- .../useful-build-time-flags.html | 27 +- .../useful-runtime-flags.html | 27 +- .../python-api/tensorrt_llm.functional.html | 27 +- .../python-api/tensorrt_llm.layers.html | 27 +- .../python-api/tensorrt_llm.models.html | 27 +- .../python-api/tensorrt_llm.plugin.html | 27 +- .../python-api/tensorrt_llm.quantization.html | 27 +- .../python-api/tensorrt_llm.runtime.html | 27 +- latest/legacy/reference/memory.html | 31 +- .../multimodal-feature-support-matrix.html | 27 +- latest/legacy/reference/precision.html | 47 +- latest/legacy/reference/support-matrix.html | 27 +- latest/legacy/reference/troubleshooting.html | 27 +- latest/legacy/tensorrt_quickstart.html | 27 +- latest/legacy/torch.html | 27 +- latest/llm-api/index.html | 27 +- latest/llm-api/reference.html | 253 +- latest/models/adding-new-model.html | 27 +- latest/models/supported-models.html | 27 +- latest/objects.inv | Bin 181168 -> 181367 bytes latest/overview.html | 29 +- latest/py-modindex.html | 27 +- latest/quick-start-guide.html | 29 +- latest/release-notes.html | 27 +- latest/search.html | 27 +- latest/searchindex.js | 2 +- latest/torch/adding_new_model.html | 27 +- latest/torch/arch_overview.html | 27 +- latest/torch/attention.html | 27 +- .../benchmarking_with_trtllm_bench.html | 27 +- .../auto_deploy/advanced/example_run.html | 27 +- .../advanced/expert_configurations.html | 27 +- .../torch/auto_deploy/advanced/logging.html | 27 +- .../advanced/serving_with_trtllm_serve.html | 27 +- 
.../torch/auto_deploy/advanced/workflow.html | 27 +- latest/torch/auto_deploy/auto-deploy.html | 27 +- latest/torch/auto_deploy/support_matrix.html | 27 +- latest/torch/features/checkpoint_loading.html | 27 +- latest/torch/features/lora.html | 27 +- latest/torch/features/overlap_scheduler.html | 27 +- latest/torch/features/quantization.html | 27 +- latest/torch/features/sampling.html | 27 +- latest/torch/kv_cache_manager.html | 27 +- latest/torch/scheduler.html | 27 +- 296 files changed, 18530 insertions(+), 16483 deletions(-) diff --git a/latest/.buildinfo b/latest/.buildinfo index 1f24c7367c..e1872b4a6c 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: a9c5c8c57021602368f541d74d22523d +config: 5b10b2153627779ea5be4dbb07d82396 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html index 182be9c38a..824e5f6543 100644 --- a/latest/_cpp_gen/executor.html +++ b/latest/_cpp_gen/executor.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,20 +61,24 @@ + + + - + + @@ -80,6 +86,8 @@ + +
@@ -506,8 +514,8 @@

Executor#

-
-

transferAgent.h#

+
+

disaggServerUtil.h#

namespace tensorrt_llm#
@@ -515,6 +523,749 @@
namespace executor#
+
+namespace disagg_executor#
+
+
+class DisaggExecutorOrchestrator#
+
+

Public Functions

+
+
+DisaggExecutorOrchestrator( + +
+
std::vector<std::filesystem::path> const &ctxEnginePaths,
+
std::vector<std::filesystem::path> const &genEnginePaths,
+
std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs,
+
std::vector<executor::ExecutorConfig> const &genExecutorConfigs,
+
bool hasContextAwaitThreads,
+
bool hasGenAwaitThreads,
+
+ +)#
+

Constructs a DisaggExecutorOrchestrator object.

+
+
Parameters:
+
    +
  • ctxEnginePaths – A vector of file paths to context engine files.

  • +
  • genEnginePaths – A vector of file paths to generation engine files.

  • +
  • ctxExecutorConfigs – A vector of ExecutorConfig for context executors.

  • +
  • genExecutorConfigs – A vector of ExecutorConfig for generation executors.

  • +
  • hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.

  • +
  • hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.

  • +
+
+
+
+ +
+
+std::vector<IdType> enqueueContext( + +
+
std::vector<texec::Request> const &requests,
+
std::optional<int> selectContextId = std::nullopt,
+
bool batch = false,
+
+ +)#
+

Enqueue context-only requests to context executors.

+
+
Parameters:
+
    +
  • requests – A vector of context-only requests.

  • +
  • selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • +
  • batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.

  • +
+
+
Returns:
+

A vector of global request ids, corresponding to the order of the requests in requests. The id returned may be different from the request id in each executor.

+
+
+
+ +
+
+void enqueueGeneration( + +
+
std::vector<texec::Request> const &requests,
+
std::vector<IdType> const &globalRequestIds,
+
std::optional<int> selectGenIdx = std::nullopt,
+
bool batch = false,
+
+ +)#
+

Enqueue generation-only requests to generation executors.

+
+
Parameters:
+
    +
  • requests – A vector of generation-only requests.

  • +
  • globalRequestIds – A vector of global request ids, corresponding to the order of the requests, and must be the ids returned by the enqueueContext function.

  • +
  • selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • +
  • batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.

  • +
+
+
+
+ +
+
+std::vector<ResponseWithId> awaitContextResponses( + +
+
std::optional<std::chrono::milliseconds> const &timeout,
+
std::optional<int> contextIdx = std::nullopt,
+
+ +)#
+

Await for context responses.

+
+
Parameters:
+
    +
  • timeout – The maximum time to wait for new responses

  • +
  • contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.

  • +
+
+
Returns:
+

A vector of responses with corresponding global request ids

+
+
+
+ +
+
+std::vector<ResponseWithId> awaitGenerationResponses( + +
+
std::optional<std::chrono::milliseconds> const &timeout,
+
std::optional<int> genIdx = std::nullopt,
+
+ +)#
+

Await for generation responses.

+
+
Parameters:
+
    +
  • timeout – The maximum time to wait for new responses.

  • +
  • genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.

  • +
+
+
Returns:
+

A vector of responses with corresponding global request ids.

+
+
+
+ +
+
+bool canEnqueue() const#
+

Indicates if the current process is allowed to enqueueRequests.

+
+ +
+
+std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors( + +
+
+ +) const#
+

Get context executors.

+
+ +
+
+std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors( + +
+
+ +) const#
+

Get generation executors.

+
+ +
+
+~DisaggExecutorOrchestrator()#
+
+ +
+
+

Private Members

+
+
+std::unique_ptr<Impl> mImpl#
+
+ +
+
+ +
+
+struct ResponseWithId#
+
+

Public Functions

+
+
+inline ResponseWithId( + +
+
tensorrt_llm::executor::Response &&response,
+
IdType gid,
+
+ +)#
+
+ +
+
+inline ResponseWithId( + +
+
tensorrt_llm::executor::Response const &response,
+
IdType gid,
+
+ +)#
+
+ +
+
+inline ResponseWithId(ResponseWithId &&other) noexcept#
+
+ +
+
+ResponseWithId(ResponseWithId const &other) = default#
+
+ +
+
+inline ResponseWithId &operator=(ResponseWithId &&other) noexcept#
+
+ +
+
+inline ResponseWithId &operator=(ResponseWithId const &other)#
+
+ +
+
+~ResponseWithId() = default#
+
+ +
+
+

Public Members

+
+
+tensorrt_llm::executor::Response response#
+
+ +
+
+IdType gid#
+
+ +
+
+ +
+ +
+ + + +
+
+

tensor.h#

+
+
+namespace tensorrt_llm
+
+
+namespace executor
+
+
+class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>#
+
+

Public Types

+
+
+using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>#
+
+ +
+
+using DimType64 = typename std::remove_cv_t<Base::value_type>#
+
+ +
+
+

Public Functions

+
+
+inline Shape()#
+
+ +
+
+inline Shape(DimType64 const *data, Base::size_type size)#
+
+ +
+
+inline Shape(std::initializer_list<DimType64> dims)#
+
+ +
+
+ +
+
+class Tensor#
+
+

Public Types

+
+
+using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>#
+
+ +
+
+

Public Functions

+
+
+Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const#
+
+ +
+
+Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const#
+
+ +
+
+Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const#
+
+ +
+
+Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const#
+
+ +
+
+Tensor copyToGpu(Tensor::CudaStreamPtr stream) const#
+
+ +
+
+Tensor() noexcept = default#
+
+ +
+
+~Tensor() = default#
+
+ +
+
+Tensor(Tensor const &other) noexcept = default#
+
+ +
+
+Tensor(Tensor &&other) noexcept = default#
+
+ +
+
+Tensor &operator=(Tensor const &other) noexcept = default#
+
+ +
+
+Tensor &operator=(Tensor &&other) noexcept = default#
+
+ +
+
+void *getData()#
+

Returns a pointer to underlying array.

+
+ +
+
+void const *getData() const#
+

Returns a pointer to underlying array.

+
+ +
+
+DataType getDataType() const#
+

Returns the data type of the buffer.

+
+ +
+
+MemoryType getMemoryType() const#
+

Returns the memory type of the buffer.

+
+ +
+
+Shape getShape() const#
+

Returns the tensor dimensions.

+
+ +
+
+std::size_t getSize() const#
+

Returns the number of elements in the tensor.

+
+ +
+
+std::size_t getSizeInBytes() const#
+

Returns the size of the tensor in bytes.

+
+ +
+
+void setZero(CudaStreamPtr stream = nullptr)#
+

Set the entire memory to zero.

+
+
Parameters:
+

stream – Must be a valid CUDA stream if the memory type is GPU.

+
+
+
+ +
+
+void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)#
+

Copy the data and shape from another tensor.

+
+
Parameters:
+
    +
  • other – A tensor to copy from.

  • +
  • stream – Must be a valid CUDA stream if the memory type is GPU.

  • +
+
+
+
+ +
+
+inline explicit operator bool() const#
+
+ +
+
+inline bool operator==(Tensor const &rhs) const#
+
+ +
+
+inline bool operator!=(Tensor const &rhs) const#
+
+ +
+
+

Public Static Functions

+
+
+static Tensor cpu(DataType dataType, Shape shape = {})#
+

Allocate a cpu tensor with the given shape and data type.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor cpu(Shape shape = {})#
+
+ +
+
+static Tensor pinned(DataType dataType, Shape shape = {})#
+

Allocate a cpu tensor in pinned memory with the given shape and data type.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor pinned(Shape shape = {})#
+
+ +
+
+static Tensor pooledPinned(DataType dataType, Shape shape = {})#
+

Allocate a cpu tensor in pooled pinned memory with the given shape and data type.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor pooledPinned( + +
+
Shape shape = {},
+
+ +)#
+
+ +
+
+static Tensor managed(DataType dataType, Shape shape = {})#
+

Allocate a tensor in managed memory (UVM) with the given shape and data type.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor managed(Shape shape = {})#
+
+ +
+
+static Tensor gpu( + +
+
DataType dataType,
+
CudaStreamPtr stream,
+
Shape shape = {},
+
+ +)#
+

Allocate a gpu tensor with the given shape and data type on a particular cuda stream.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • +
  • dataType – The data type of the tensor.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor gpu( + +
+
CudaStreamPtr stream,
+
Shape shape = {},
+
+ +)#
+
+ +
+
+static Tensor of(DataType dataType, void *data, Shape shape)#
+

Wrap a data pointer into a tensor without taking ownership.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor of(T *data, Shape shape)#
+

Wrap a data pointer into a tensor without taking ownership.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • +
+
+
+
+ +
+
+template<typename T>
static inline Tensor of(T &data)#
+

Wrap any container into a tensor without taking ownership.

+
+
Parameters:
+
    +
  • shape – The shape of the tensor.

  • +
  • dataType – The data type of the tensor.

  • +
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • +
+
+
+
+ +
+
+

Private Types

+
+
+using Impl = runtime::ITensor#
+
+ +
+
+

Private Functions

+
+
+explicit Tensor(std::shared_ptr<runtime::ITensor> tensor)#
+
+ +
+
+Tensor copyTo( + +
+
std::shared_ptr<Impl> tensor,
+
CudaStreamPtr stream,
+
+ +) const#
+
+ +
+
+

Private Members

+
+
+std::shared_ptr<Impl> mTensor#
+
+ +
+
+

Private Static Functions

+
+
+template<typename T>
static inline DataType getRuntimeType()#
+
+ +
+
+

Friends

+
+
+friend class Serialization
+
+ +
+
+friend std::shared_ptr<runtime::ITensor> const &toITensor( + +
+
Tensor const &tensor,
+
+ +)#
+
+ +
+
+friend Tensor ofITensor( + +
+
std::shared_ptr<runtime::ITensor> tensor,
+
+ +)#
+
+ +
+
+ +
+
+namespace detail#
+
+

Typedefs

+
+
+using DimType64 = int64_t#
+
+ +
+
+

Functions

+
+
+std::shared_ptr<runtime::ITensor> const &toITensor( + +
+
Tensor const &tensor,
+
+ +)#
+
+ +
+
+Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)#
+
+ +
+
+ +
+ +
+
+namespace runtime#
+
+ +
+ +
+
+

transferAgent.h#

+
+
+namespace tensorrt_llm
+
+
+namespace executor
+
namespace kv_cache#
@@ -1281,6 +2032,1741 @@
+
+
+

serialization.h#

+
+
+namespace tensorrt_llm
+
+
+namespace executor
+
+
+class Serialization#
+
+

Public Static Functions

+
+
+static size_t serializedSize( + +
+
tensorrt_llm::batch_manager::kv_cache_manager::BlockKey const &key,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
tensorrt_llm::batch_manager::kv_cache_manager::BlockKey const &key,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static tensorrt_llm::batch_manager::kv_cache_manager::BlockKey deserializeBlockKey( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static RequestPerfMetrics::TimePoint deserializeTimePoint( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
RequestPerfMetrics::TimePoint const &tp,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(RequestPerfMetrics::TimePoint const&)#
+
+ +
+
+static RequestPerfMetrics deserializeRequestPerfMetrics( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
RequestPerfMetrics const &metrics,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(RequestPerfMetrics const &metrics)#
+
+ +
+
+static SamplingConfig deserializeSamplingConfig(std::istream &is)#
+
+ +
+
+static void serialize(SamplingConfig const &config, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(SamplingConfig const &config)#
+
+ +
+
+static OutputConfig deserializeOutputConfig(std::istream &is)#
+
+ +
+
+static void serialize(OutputConfig const &config, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(OutputConfig const &config)#
+
+ +
+
+static AdditionalModelOutput deserializeAdditionalModelOutput( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
AdditionalModelOutput const &additionalModelOutput,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
AdditionalModelOutput const &additionalModelOutput,
+
+ +)#
+
+ +
+
+static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
ExternalDraftTokensConfig const &config,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(ExternalDraftTokensConfig const &config)#
+
+ +
+
+static PromptTuningConfig deserializePromptTuningConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
PromptTuningConfig const &config,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(PromptTuningConfig const &config)#
+
+ +
+
+static MultimodalInput deserializeMultimodalInput(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
MultimodalInput const &multimodalInput,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(MultimodalInput const &multimodalInput)#
+
+ +
+
+static MropeConfig deserializeMropeConfig(std::istream &is)#
+
+ +
+
+static void serialize(MropeConfig const &config, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(MropeConfig const &config)#
+
+ +
+
+static LoraConfig deserializeLoraConfig(std::istream &is)#
+
+ +
+
+static void serialize(LoraConfig const &config, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(LoraConfig const &config)#
+
+ +
+
+static kv_cache::CommState deserializeCommState(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
kv_cache::CommState const &state,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(kv_cache::CommState const &state)#
+
+ +
+
+static kv_cache::SocketState deserializeSocketState(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
kv_cache::SocketState const &state,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(kv_cache::SocketState const &state)#
+
+ +
+
+static kv_cache::AgentState deserializeAgentState(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
kv_cache::AgentState const &state,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(kv_cache::AgentState const &state)#
+
+ +
+
+static kv_cache::CacheState deserializeCacheState(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
kv_cache::CacheState const &state,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(kv_cache::CacheState const &state)#
+
+ +
+
+static DataTransceiverState deserializeDataTransceiverState( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static DataTransceiverState deserializeDataTransceiverState( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
DataTransceiverState const &dataTransceiverState,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static std::vector<char> serialize( + +
+
DataTransceiverState const &dataTransceiverState,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
DataTransceiverState const &dataTransceiverState,
+
+ +)#
+
+ +
+
+static ContextPhaseParams deserializeContextPhaseParams( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
ContextPhaseParams const &contextPhaseParams,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
ContextPhaseParams const &contextPhaseParams,
+
+ +)#
+
+ +
+
+static Request deserializeRequest(std::istream &is)#
+
+ +
+
+static void serialize(Request const &request, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(Request const &request)#
+
+ +
+
+static Tensor deserializeTensor(std::istream &is)#
+
+ +
+
+static void serialize(Tensor const &tensor, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(Tensor const &tensor)#
+
+ +
+
+static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
SpeculativeDecodingFastLogitsInfo const &info,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
SpeculativeDecodingFastLogitsInfo const &info,
+
+ +)#
+
+ +
+
+static Result deserializeResult(std::istream &is)#
+
+ +
+
+static void serialize(Result const &result, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(Result const &result)#
+
+ +
+
+static AdditionalOutput deserializeAdditionalOutput(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
AdditionalOutput const &additionalOutput,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
AdditionalOutput const &additionalOutput,
+
+ +)#
+
+ +
+
+static Response deserializeResponse(std::istream &is)#
+
+ +
+
+static void serialize(Response const &response, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(Response const &response)#
+
+ +
+
+static std::vector<Response> deserializeResponses( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static std::vector<char> serialize( + +
+
std::vector<Response> const &responses,
+
+ +)#
+
+ +
+
+static KvCacheConfig deserializeKvCacheConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
KvCacheConfig const &kvCacheConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KvCacheConfig const &kvCacheConfig)#
+
+ +
+
+static DynamicBatchConfig deserializeDynamicBatchConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
DynamicBatchConfig const &dynamicBatchConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
DynamicBatchConfig const &dynamicBatchConfig,
+
+ +)#
+
+ +
+
+static SchedulerConfig deserializeSchedulerConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
SchedulerConfig const &schedulerConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(SchedulerConfig const &schedulerConfig)#
+
+ +
+
+static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
+
+ +)#
+
+ +
+
+static ParallelConfig deserializeParallelConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
ParallelConfig const &parallelConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(ParallelConfig const &parallelConfig)#
+
+ +
+
+static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
PeftCacheConfig const &peftCacheConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)#
+
+ +
+
+static OrchestratorConfig deserializeOrchestratorConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
OrchestratorConfig const &orchestratorConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
OrchestratorConfig const &orchestratorConfig,
+
+ +)#
+
+ +
+
+static DecodingMode deserializeDecodingMode(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
DecodingMode const &decodingMode,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(DecodingMode const &decodingMode)#
+
+ +
+
+static LookaheadDecodingConfig deserializeLookaheadDecodingConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
LookaheadDecodingConfig const &lookaheadDecodingConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
LookaheadDecodingConfig const &lookaheadDecodingConfig,
+
+ +)#
+
+ +
+
+static EagleConfig deserializeEagleConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
EagleConfig const &eagleConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(EagleConfig const &eagleConfig)#
+
+ +
+
+static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
SpeculativeDecodingConfig const &specDecConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
SpeculativeDecodingConfig const &specDecConfig,
+
+ +)#
+
+ +
+
+static GuidedDecodingConfig deserializeGuidedDecodingConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
GuidedDecodingConfig const &guidedDecodingConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
GuidedDecodingConfig const &guidedDecodingConfig,
+
+ +)#
+
+ +
+
+static GuidedDecodingParams deserializeGuidedDecodingParams( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
GuidedDecodingParams const &guidedDecodingParams,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
GuidedDecodingParams const &guidedDecodingParams,
+
+ +)#
+
+ +
+
+static KvCacheRetentionConfig deserializeKvCacheRetentionConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
KvCacheRetentionConfig const &kvCacheRetentionConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
KvCacheRetentionConfig const &kvCacheRetentionConfig,
+
+ +)#
+
+ +
+
+static KvCacheRetentionConfig::TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
+
+ +)#
+
+ +
+
+static DecodingConfig deserializeDecodingConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
DecodingConfig const &decodingConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(DecodingConfig const &decodingConfig)#
+
+ +
+
+static DebugConfig deserializeDebugConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
DebugConfig const &debugConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(DebugConfig const &debugConfig)#
+
+ +
+
+static CacheTransceiverConfig deserializeCacheTransceiverConfig( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
CacheTransceiverConfig const &cacheTransceiverConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
CacheTransceiverConfig const &cacheTransceiverConfig,
+
+ +)#
+
+ +
+
+static ExecutorConfig deserializeExecutorConfig(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
ExecutorConfig const &executorConfig,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(ExecutorConfig const &executorConfig)#
+
+ +
+
+static KvCacheStats deserializeKvCacheStats(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
KvCacheStats const &kvCacheStats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KvCacheStats const &kvCacheStats)#
+
+ +
+
+static StaticBatchingStats deserializeStaticBatchingStats( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
StaticBatchingStats const &staticBatchingStats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
StaticBatchingStats const &staticBatchingStats,
+
+ +)#
+
+ +
+
+static InflightBatchingStats deserializeInflightBatchingStats( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
InflightBatchingStats const &inflightBatchingStats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
InflightBatchingStats const &inflightBatchingStats,
+
+ +)#
+
+ +
+
+static SpecDecodingStats deserializeSpecDecodingStats( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
SpecDecodingStats const &specDecodingStats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
SpecDecodingStats const &specDecodingStats,
+
+ +)#
+
+ +
+
+static IterationStats deserializeIterationStats( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static IterationStats deserializeIterationStats(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
IterationStats const &iterStats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static std::vector<char> serialize(IterationStats const &iterStats)#
+
+ +
+
+static size_t serializedSize(IterationStats const &iterStats)#
+
+ +
+
+static std::vector<char> serialize( + +
+
std::vector<IterationStats> const &iterStatsVec,
+
+ +)#
+
+ +
+
+static std::vector<IterationStats> deserializeIterationStatsVec( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static DisServingRequestStats deserializeDisServingRequestStats( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
DisServingRequestStats const &stats,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
DisServingRequestStats const &disServingRequestStats,
+
+ +)#
+
+ +
+
+static RequestStage deserializeRequestStage(std::istream &is)#
+
+ +
+
+static void serialize( + +
+
RequestStage const &requestStage,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static size_t serializedSize(RequestStage const &requestStage)#
+
+ +
+
+static RequestStats deserializeRequestStats(std::istream &is)#
+
+ +
+
+static void serialize(RequestStats const &state, std::ostream &os)#
+
+ +
+
+static size_t serializedSize(RequestStats const &state)#
+
+ +
+
+static RequestStatsPerIteration deserializeRequestStatsPerIteration( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static RequestStatsPerIteration deserializeRequestStatsPerIteration( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
RequestStatsPerIteration const &state,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static std::vector<char> serialize( + +
+
RequestStatsPerIteration const &state,
+
+ +)#
+
+ +
+
+static size_t serializedSize(RequestStatsPerIteration const &state)#
+
+ +
+
+static std::vector<char> serialize( + +
+
std::vector<RequestStatsPerIteration> const &requestStatsVec,
+
+ +)#
+
+ +
+
+static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static std::vector<char> serialize( + +
+
std::deque<KVCacheEvent> const &kvCacheEvents,
+
+ +)#
+
+ +
+
+static std::deque<KVCacheEvent> deserializeKVCacheEvents( + +
+
std::vector<char> &buffer,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KVCacheEvent const &event)#
+
+ +
+
+static void serialize(KVCacheEvent const &event, std::ostream &os)#
+
+ +
+
+static KVCacheEvent deserializeKVCacheEvent(std::istream &is)#
+
+ +
+
+static size_t serializedSize(KVCacheCreatedData const &data)#
+
+ +
+
+static void serialize( + +
+
KVCacheCreatedData const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static KVCacheCreatedData deserializeKVCacheCreatedData( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KVCacheStoredData const &data)#
+
+ +
+
+static void serialize( + +
+
KVCacheStoredData const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static KVCacheStoredData deserializeKVCacheStoredData( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KVCacheStoredBlockData const &data)#
+
+ +
+
+static void serialize( + +
+
KVCacheStoredBlockData const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static KVCacheStoredBlockData deserializeKVCacheStoredBlockData( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KVCacheRemovedData const &data)#
+
+ +
+
+static void serialize( + +
+
KVCacheRemovedData const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static KVCacheRemovedData deserializeKVCacheRemovedData( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+template<typename T>
static size_t serializedSize( + +
+
KVCacheEventDiff<T> const &data,
+
+ +)#
+
+ +
+
+template<typename T>
static void serialize( + +
+
KVCacheEventDiff<T> const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+template<typename T>
static KVCacheEventDiff<T> deserializeKVCacheEventDiff( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static size_t serializedSize(KVCacheUpdatedData const &data)#
+
+ +
+
+static void serialize( + +
+
KVCacheUpdatedData const &data,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static KVCacheUpdatedData deserializeKVCacheUpdatedData( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static size_t serializedSize( + +
+
tensorrt_llm::runtime::UniqueToken const &token,
+
+ +)#
+
+ +
+
+static void serialize( + +
+
tensorrt_llm::runtime::UniqueToken const &token,
+
std::ostream &os,
+
+ +)#
+
+ +
+
+static tensorrt_llm::runtime::UniqueToken deserializeUniqueToken( + +
+
std::istream &is,
+
+ +)#
+
+ +
+
+static std::string deserializeString(std::istream &is)#
+
+ +
+
+static bool deserializeBool(std::istream &is)#
+
+ +
+
+static ModelType deserializeModelType(std::istream &is)#
+
+ +
+
+ +
+
+namespace kv_cache
+
+ +
+ +
+

types.h#

@@ -3029,879 +5515,6 @@ -
-
-namespace runtime#
-
- - - -
-
-

cacheCommunicator.h#

-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-namespace kv_cache
-
-
-class Connection#
-
-

Public Functions

-
-
-virtual ~Connection() = default#
-
- -
-
-virtual void send( - -
-
DataContext const &ctx,
-
void const *data,
-
size_t size,
-
- -) const = 0#
-
- -
-
-virtual void recv( - -
-
DataContext const &ctx,
-
void *data,
-
size_t size,
-
- -) const = 0#
-
- -
-
-inline virtual bool isThreadSafe() const noexcept#
-
- -
-
- -
-
-class ConnectionManager#
-
-

Public Functions

-
-
-virtual ~ConnectionManager() = default#
-
- -
-
-virtual Connection const *recvConnect( - -
-
DataContext const &ctx,
-
void *data,
-
size_t size,
-
- -) = 0#
-
- -
-
-virtual std::vector<Connection const*> getConnections( - -
-
CommState const &state,
-
- -) = 0#
-
- -
-
-virtual CommState const &getCommState() const = 0#
-
- -
-
- -
-
-struct DataContext#
-
-

Public Functions

-
-
-inline explicit DataContext(int tag)#
-
- -
-
-inline int getTag() const noexcept#
-
- -
-
-

Private Members

-
-
-int const mTag#
-
- -
-
- -
- -
- -
- -
-
-

disaggServerUtil.h#

-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-namespace disagg_executor#
-
-
-class DisaggExecutorOrchestrator#
-
-

Public Functions

-
-
-DisaggExecutorOrchestrator( - -
-
std::vector<std::filesystem::path> const &ctxEnginePaths,
-
std::vector<std::filesystem::path> const &genEnginePaths,
-
std::vector<executor::ExecutorConfig> const &ctxExecutorConfigs,
-
std::vector<executor::ExecutorConfig> const &genExecutorConfigs,
-
bool hasContextAwaitThreads,
-
bool hasGenAwaitThreads,
-
- -)#
-

Constructs a DisaggExecutorOrchestrator object.

-
-
Parameters:
-
    -
  • ctxEnginePaths – A vector of file paths to context engine files.

  • -
  • genEnginePaths – A vector of file paths to generation engine files.

  • -
  • ctxExecutorConfigs – A vector of ExecutorConfig for context executors.

  • -
  • genExecutorConfigs – A vector of ExecutorConfig for generation executors.

  • -
  • hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.

  • -
  • hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.

  • -
-
-
-
- -
-
-std::vector<IdType> enqueueContext( - -
-
std::vector<texec::Request> const &requests,
-
std::optional<int> selectContextId = std::nullopt,
-
bool batch = false,
-
- -)#
-

Enqueue context-only requests to context executors.

-
-
Parameters:
-
    -
  • requests – A vector of context-only requests.

  • -
  • selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • -
  • batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.

  • -
-
-
Returns:
-

A vector of global request ids, corresponding to the order of the requests in requests. The id returned may be different from the request id in each executor.

-
-
-
- -
-
-void enqueueGeneration( - -
-
std::vector<texec::Request> const &requests,
-
std::vector<IdType> const &globalRequestIds,
-
std::optional<int> selectGenIdx = std::nullopt,
-
bool batch = false,
-
- -)#
-

Enqueue generation-only requests to generation executors.

-
-
Parameters:
-
    -
  • requests – A vector of generation-only requests.

  • -
  • globalRequestIds – A vector of global request ids, corresponding to the order of the requests, and must be the ids returned by the enqueueContext function.

  • -
  • selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.

  • -
  • batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.

  • -
-
-
-
- -
-
-std::vector<ResponseWithId> awaitContextResponses( - -
-
std::optional<std::chrono::milliseconds> const &timeout,
-
std::optional<int> contextIdx = std::nullopt,
-
- -)#
-

Await for context responses.

-
-
Parameters:
-
    -
  • timeout – The maximum time to wait for new responses.

  • -
  • contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.

  • -
-
-
Returns:
-

A vector of responses with corresponding global request ids

-
-
-
- -
-
-std::vector<ResponseWithId> awaitGenerationResponses( - -
-
std::optional<std::chrono::milliseconds> const &timeout,
-
std::optional<int> genIdx = std::nullopt,
-
- -)#
-

Await for generation responses.

-
-
Parameters:
-
    -
  • timeout – The maximum time to wait for new responses.

  • -
  • genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.

  • -
-
-
Returns:
-

A vector of responses with corresponding global request ids.

-
-
-
- -
-
-bool canEnqueue() const#
-

Indicates if the current process is allowed to enqueueRequests.

-
- -
-
-std::vector<std::unique_ptr<texec::Executor>> const &getContextExecutors( - -
-
- -) const#
-

Get context executors.

-
- -
-
-std::vector<std::unique_ptr<texec::Executor>> const &getGenExecutors( - -
-
- -) const#
-

Get generation executors.

-
- -
-
-~DisaggExecutorOrchestrator()#
-
- -
-
-

Private Members

-
-
-std::unique_ptr<Impl> mImpl#
-
- -
-
- -
-
-struct ResponseWithId#
-
-

Public Functions

-
-
-inline ResponseWithId( - -
-
tensorrt_llm::executor::Response &&response,
-
IdType gid,
-
- -)#
-
- -
-
-inline ResponseWithId( - -
-
tensorrt_llm::executor::Response const &response,
-
IdType gid,
-
- -)#
-
- -
-
-inline ResponseWithId(ResponseWithId &&other) noexcept#
-
- -
-
-ResponseWithId(ResponseWithId const &other) = default#
-
- -
-
-inline ResponseWithId &operator=(ResponseWithId &&other) noexcept#
-
- -
-
-inline ResponseWithId &operator=(ResponseWithId const &other)#
-
- -
-
-~ResponseWithId() = default#
-
- -
-
-

Public Members

-
-
-tensorrt_llm::executor::Response response#
-
- -
-
-IdType gid#
-
- -
-
- -
- -
- -
- -
-
-

tensor.h#

-
-
-namespace tensorrt_llm
-
-
-namespace executor
-
-
-class Shape : public tensorrt_llm::common::ArrayView<detail::DimType64 const>#
-
-

Public Types

-
-
-using Base = tensorrt_llm::common::ArrayView<detail::DimType64 const>#
-
- -
-
-using DimType64 = typename std::remove_cv_t<Base::value_type>#
-
- -
-
-

Public Functions

-
-
-inline Shape()#
-
- -
-
-inline Shape(DimType64 const *data, Base::size_type size)#
-
- -
-
-inline Shape(std::initializer_list<DimType64> dims)#
-
- -
-
- -
-
-class Tensor#
-
-

Public Types

-
-
-using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>#
-
- -
-
-

Public Functions

-
-
-Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const#
-
- -
-
-Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const#
-
- -
-
-Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const#
-
- -
-
-Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const#
-
- -
-
-Tensor copyToGpu(Tensor::CudaStreamPtr stream) const#
-
- -
-
-Tensor() noexcept = default#
-
- -
-
-~Tensor() = default#
-
- -
-
-Tensor(Tensor const &other) noexcept = default#
-
- -
-
-Tensor(Tensor &&other) noexcept = default#
-
- -
-
-Tensor &operator=(Tensor const &other) noexcept = default#
-
- -
-
-Tensor &operator=(Tensor &&other) noexcept = default#
-
- -
-
-void *getData()#
-

Returns a pointer to underlying array.

-
- -
-
-void const *getData() const#
-

Returns a pointer to underlying array.

-
- -
-
-DataType getDataType() const#
-

Returns the data type of the buffer.

-
- -
-
-MemoryType getMemoryType() const#
-

Returns the memory type of the buffer.

-
- -
-
-Shape getShape() const#
-

Returns the tensor dimensions.

-
- -
-
-std::size_t getSize() const#
-

Returns the number of elements in the tensor.

-
- -
-
-std::size_t getSizeInBytes() const#
-

Returns the size of the tensor in bytes.

-
- -
-
-void setZero(CudaStreamPtr stream = nullptr)#
-

Set the entire memory to zero.

-
-
Parameters:
-

stream – Must be a valid CUDA stream if the memory type is GPU.

-
-
-
- -
-
-void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)#
-

Copy the data and shape from another tensor.

-
-
Parameters:
-
    -
  • other – A tensor to copy from.

  • -
  • stream – Must be a valid CUDA stream if the memory type is GPU.

  • -
-
-
-
- -
-
-inline explicit operator bool() const#
-
- -
-
-inline bool operator==(Tensor const &rhs) const#
-
- -
-
-inline bool operator!=(Tensor const &rhs) const#
-
- -
-
-

Public Static Functions

-
-
-static Tensor cpu(DataType dataType, Shape shape = {})#
-

Allocate a cpu tensor with the given shape and data type.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor cpu(Shape shape = {})#
-
- -
-
-static Tensor pinned(DataType dataType, Shape shape = {})#
-

Allocate a cpu tensor in pinned memory with the given shape and data type.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor pinned(Shape shape = {})#
-
- -
-
-static Tensor pooledPinned(DataType dataType, Shape shape = {})#
-

Allocate a cpu tensor in pooled pinned memory with the given shape and data type.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor pooledPinned( - -
-
Shape shape = {},
-
- -)#
-
- -
-
-static Tensor managed(DataType dataType, Shape shape = {})#
-

Allocate a tensor in managed memory (UVM) with the given shape and data type.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor managed(Shape shape = {})#
-
- -
-
-static Tensor gpu( - -
-
DataType dataType,
-
CudaStreamPtr stream,
-
Shape shape = {},
-
- -)#
-

Allocate a gpu tensor with the given shape and data type on a particular cuda stream.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • -
  • dataType – The data type of the tensor.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor gpu( - -
-
CudaStreamPtr stream,
-
Shape shape = {},
-
- -)#
-
- -
-
-static Tensor of(DataType dataType, void *data, Shape shape)#
-

Wrap a data pointer into a tensor without taking ownership.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor of(T *data, Shape shape)#
-

Wrap a data pointer into a tensor without taking ownership.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • -
-
-
-
- -
-
-template<typename T>
static inline Tensor of(T &data)#
-

Wrap any container into a tensor without taking ownership.

-
-
Parameters:
-
    -
  • shape – The shape of the tensor.

  • -
  • dataType – The data type of the tensor.

  • -
  • stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.

  • -
-
-
-
- -
-
-

Private Types

-
-
-using Impl = runtime::ITensor#
-
- -
-
-

Private Functions

-
-
-explicit Tensor(std::shared_ptr<runtime::ITensor> tensor)#
-
- -
-
-Tensor copyTo( - -
-
std::shared_ptr<Impl> tensor,
-
CudaStreamPtr stream,
-
- -) const#
-
- -
-
-

Private Members

-
-
-std::shared_ptr<Impl> mTensor#
-
- -
-
-

Private Static Functions

-
-
-template<typename T>
static inline DataType getRuntimeType()#
-
- -
-
-

Friends

-
-
-friend class Serialization
-
- -
-
-friend std::shared_ptr<runtime::ITensor> const &toITensor( - -
-
Tensor const &tensor,
-
- -)#
-
- -
-
-friend Tensor ofITensor( - -
-
std::shared_ptr<runtime::ITensor> tensor,
-
- -)#
-
- -
-
- -
-
-namespace detail#
-
-

Typedefs

-
-
-using DimType64 = int64_t#
-
- -
-
-

Functions

-
-
-std::shared_ptr<runtime::ITensor> const &toITensor( - -
-
Tensor const &tensor,
-
- -)#
-
- -
-
-Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)#
-
- -
-
- -
-
namespace runtime
@@ -4084,16 +5697,17 @@

Public Functions

-
-explicit CacheTransceiverConfig( +
+explicit CacheTransceiverConfig(
std::optional<BackendType> backendType = std::nullopt,
std::optional<size_t> maxNumTokens = std::nullopt,
std::optional<int> kvTransferTimeoutMs = std::nullopt,
+
std::optional<int> kvTransferSenderFutureTimeoutMs = std::nullopt,
-)#
+)#
@@ -4117,8 +5731,14 @@
-
-std::optional<int> getKvTransferTimeoutMs() const#
+
+void setKvTransferSenderFutureTimeoutMs( + +
+
std::optional<int> kvTransferSenderFutureTimeoutMs,
+
+ +)#
@@ -4131,6 +5751,16 @@ std::optional<BackendType> getBackendType() const#
+
+
+std::optional<int> getKvTransferTimeoutMs() const#
+
+ +
+
+std::optional<int> getKvTransferSenderFutureTimeoutMs() const#
+
+

Private Members

@@ -4150,6 +5780,11 @@ std::optional<int> mKvTransferTimeoutMs#
+
+
+std::optional<int> mKvTransferSenderFutureTimeoutMs#
+
+
@@ -9821,8 +11456,8 @@

Public Functions

-
-inline CacheState( +
+inline CacheState(
ModelConfig modelConfig,
@@ -9832,14 +11467,17 @@
AttentionType attentionType = AttentionType::kDEFAULT,
int kvFactor = 2,
bool enableBlockReuse = false,
+
bool hasIndexerKCache = false,
+
SizeType32 indexerDimPerHead = 0,
+
SizeType32 indexerKCacheQuantBlockSize = 128,
-)#
+)#
-
-inline CacheState( +
+inline CacheState(
std::vector<SizeType32> nbKvHeadPerLayer,
@@ -9856,14 +11494,17 @@
int DPrank = 0,
int DPsize = 0,
bool enableBlockReuse = false,
+
bool hasIndexerKCache = false,
+
SizeType32 indexerDimPerHead = 0,
+
SizeType32 indexerKCacheQuantBlockSize = 128,
-)#
+)#
-
-inline CacheState( +
+inline CacheState(
SizeType32 nbAttentionLayers,
@@ -9881,9 +11522,12 @@
int DPrank = 0,
int DPsize = 0,
bool enableBlockReuse = false,
+
bool hasIndexerKCache = false,
+
SizeType32 indexerDimPerHead = 0,
+
SizeType32 indexerKCacheQuantBlockSize = 128,
-)#
+)#
@@ -9922,6 +11566,21 @@ inline bool getEnableBlockReuse() const#
+
+
+inline bool getHasIndexerKCache() const#
+
+ +
+
+inline SizeType32 getIndexerDimPerHead() const#
+
+ +
+
+inline SizeType32 getIndexerKCacheQuantBlockSize() const#
+
+
inline std::string toString() const#
@@ -9955,6 +11614,21 @@ bool mEnableBlockReuse = {false}#
+
+
+bool mHasIndexerKCache = {false}#
+
+ +
+
+SizeType32 mIndexerDimPerHead = {0}#
+
+ +
+
+SizeType32 mIndexerKCacheQuantBlockSize = {128}#
+
+

Friends

@@ -10263,1736 +11937,131 @@
-
-

serialization.h#

+
+

cacheCommunicator.h#

namespace tensorrt_llm
namespace executor
+
+
+namespace kv_cache
-
-class Serialization#
+
+class Connection#
-

Public Static Functions

+

Public Functions

-
-static size_t serializedSize( - -
-
tensorrt_llm::batch_manager::kv_cache_manager::BlockKey const &key,
-
- -)#
-
- -
-
-static void serialize( - -
-
tensorrt_llm::batch_manager::kv_cache_manager::BlockKey const &key,
-
std::ostream &os,
-
- -)#
-
- -
-
-static tensorrt_llm::batch_manager::kv_cache_manager::BlockKey deserializeBlockKey( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static RequestPerfMetrics::TimePoint deserializeTimePoint( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
RequestPerfMetrics::TimePoint const &tp,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(RequestPerfMetrics::TimePoint const&)#
-
- -
-
-static RequestPerfMetrics deserializeRequestPerfMetrics( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
RequestPerfMetrics const &metrics,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(RequestPerfMetrics const &metrics)#
-
- -
-
-static SamplingConfig deserializeSamplingConfig(std::istream &is)#
-
- -
-
-static void serialize(SamplingConfig const &config, std::ostream &os)#
-
- -
-
-static size_t serializedSize(SamplingConfig const &config)#
-
- -
-
-static OutputConfig deserializeOutputConfig(std::istream &is)#
-
- -
-
-static void serialize(OutputConfig const &config, std::ostream &os)#
-
- -
-
-static size_t serializedSize(OutputConfig const &config)#
-
- -
-
-static AdditionalModelOutput deserializeAdditionalModelOutput( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
AdditionalModelOutput const &additionalModelOutput,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
AdditionalModelOutput const &additionalModelOutput,
-
- -)#
-
- -
-
-static ExternalDraftTokensConfig deserializeExternalDraftTokensConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
ExternalDraftTokensConfig const &config,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(ExternalDraftTokensConfig const &config)#
-
- -
-
-static PromptTuningConfig deserializePromptTuningConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
PromptTuningConfig const &config,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(PromptTuningConfig const &config)#
-
- -
-
-static MultimodalInput deserializeMultimodalInput(std::istream &is)#
-
- -
-
-static void serialize( - -
-
MultimodalInput const &multimodalInput,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(MultimodalInput const &multimodalInput)#
-
- -
-
-static MropeConfig deserializeMropeConfig(std::istream &is)#
-
- -
-
-static void serialize(MropeConfig const &config, std::ostream &os)#
-
- -
-
-static size_t serializedSize(MropeConfig const &config)#
-
- -
-
-static LoraConfig deserializeLoraConfig(std::istream &is)#
-
- -
-
-static void serialize(LoraConfig const &config, std::ostream &os)#
-
- -
-
-static size_t serializedSize(LoraConfig const &config)#
-
- -
-
-static kv_cache::CommState deserializeCommState(std::istream &is)#
-
- -
-
-static void serialize( - -
-
kv_cache::CommState const &state,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(kv_cache::CommState const &state)#
-
- -
-
-static kv_cache::SocketState deserializeSocketState(std::istream &is)#
-
- -
-
-static void serialize( - -
-
kv_cache::SocketState const &state,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(kv_cache::SocketState const &state)#
-
- -
-
-static kv_cache::AgentState deserializeAgentState(std::istream &is)#
-
- -
-
-static void serialize( - -
-
kv_cache::AgentState const &state,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(kv_cache::AgentState const &state)#
-
- -
-
-static kv_cache::CacheState deserializeCacheState(std::istream &is)#
-
- -
-
-static void serialize( - -
-
kv_cache::CacheState const &state,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(kv_cache::CacheState const &state)#
-
- -
-
-static DataTransceiverState deserializeDataTransceiverState( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static DataTransceiverState deserializeDataTransceiverState( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static void serialize( - -
-
DataTransceiverState const &dataTransceiverState,
-
std::ostream &os,
-
- -)#
-
- -
-
-static std::vector<char> serialize( - -
-
DataTransceiverState const &dataTransceiverState,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
DataTransceiverState const &dataTransceiverState,
-
- -)#
-
- -
-
-static ContextPhaseParams deserializeContextPhaseParams( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
ContextPhaseParams const &contextPhaseParams,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
ContextPhaseParams const &contextPhaseParams,
-
- -)#
-
- -
-
-static Request deserializeRequest(std::istream &is)#
-
- -
-
-static void serialize(Request const &request, std::ostream &os)#
-
- -
-
-static size_t serializedSize(Request const &request)#
-
- -
-
-static Tensor deserializeTensor(std::istream &is)#
-
- -
-
-static void serialize(Tensor const &tensor, std::ostream &os)#
-
- -
-
-static size_t serializedSize(Tensor const &tensor)#
-
- -
-
-static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
SpeculativeDecodingFastLogitsInfo const &info,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
SpeculativeDecodingFastLogitsInfo const &info,
-
- -)#
-
- -
-
-static Result deserializeResult(std::istream &is)#
-
- -
-
-static void serialize(Result const &result, std::ostream &os)#
-
- -
-
-static size_t serializedSize(Result const &result)#
-
- -
-
-static AdditionalOutput deserializeAdditionalOutput(std::istream &is)#
-
- -
-
-static void serialize( - -
-
AdditionalOutput const &additionalOutput,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
AdditionalOutput const &additionalOutput,
-
- -)#
-
- -
-
-static Response deserializeResponse(std::istream &is)#
-
- -
-
-static void serialize(Response const &response, std::ostream &os)#
-
- -
-
-static size_t serializedSize(Response const &response)#
-
- -
-
-static std::vector<Response> deserializeResponses( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static std::vector<char> serialize( - -
-
std::vector<Response> const &responses,
-
- -)#
-
- -
-
-static KvCacheConfig deserializeKvCacheConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
KvCacheConfig const &kvCacheConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(KvCacheConfig const &kvCacheConfig)#
-
- -
-
-static DynamicBatchConfig deserializeDynamicBatchConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
DynamicBatchConfig const &dynamicBatchConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
DynamicBatchConfig const &dynamicBatchConfig,
-
- -)#
-
- -
-
-static SchedulerConfig deserializeSchedulerConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
SchedulerConfig const &schedulerConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(SchedulerConfig const &schedulerConfig)#
-
- -
-
-static ExtendedRuntimePerfKnobConfig deserializeExtendedRuntimePerfKnobConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
ExtendedRuntimePerfKnobConfig const &extendedRuntimePerfKnobConfig,
-
- -)#
-
- -
-
-static ParallelConfig deserializeParallelConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
ParallelConfig const &parallelConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(ParallelConfig const &parallelConfig)#
-
- -
-
-static PeftCacheConfig deserializePeftCacheConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
PeftCacheConfig const &peftCacheConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(PeftCacheConfig const &peftCacheConfig)#
-
- -
-
-static OrchestratorConfig deserializeOrchestratorConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
OrchestratorConfig const &orchestratorConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
OrchestratorConfig const &orchestratorConfig,
-
- -)#
-
- -
-
-static DecodingMode deserializeDecodingMode(std::istream &is)#
-
- -
-
-static void serialize( - -
-
DecodingMode const &decodingMode,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(DecodingMode const &decodingMode)#
-
- -
-
-static LookaheadDecodingConfig deserializeLookaheadDecodingConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
LookaheadDecodingConfig const &lookaheadDecodingConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
LookaheadDecodingConfig const &lookaheadDecodingConfig,
-
- -)#
-
- -
-
-static EagleConfig deserializeEagleConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
EagleConfig const &eagleConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(EagleConfig const &eagleConfig)#
-
- -
-
-static SpeculativeDecodingConfig deserializeSpeculativeDecodingConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
SpeculativeDecodingConfig const &specDecConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
SpeculativeDecodingConfig const &specDecConfig,
-
- -)#
-
- -
-
-static GuidedDecodingConfig deserializeGuidedDecodingConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
GuidedDecodingConfig const &guidedDecodingConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
GuidedDecodingConfig const &guidedDecodingConfig,
-
- -)#
-
- -
-
-static GuidedDecodingParams deserializeGuidedDecodingParams( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
GuidedDecodingParams const &guidedDecodingParams,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
GuidedDecodingParams const &guidedDecodingParams,
-
- -)#
-
- -
-
-static KvCacheRetentionConfig deserializeKvCacheRetentionConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
KvCacheRetentionConfig const &kvCacheRetentionConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
KvCacheRetentionConfig const &kvCacheRetentionConfig,
-
- -)#
-
- -
-
-static KvCacheRetentionConfig::TokenRangeRetentionConfig deserializeTokenRangeRetentionConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
KvCacheRetentionConfig::TokenRangeRetentionConfig const &tokenRangeRetentionConfig,
-
- -)#
-
- -
-
-static DecodingConfig deserializeDecodingConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
DecodingConfig const &decodingConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(DecodingConfig const &decodingConfig)#
-
- -
-
-static DebugConfig deserializeDebugConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
DebugConfig const &debugConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(DebugConfig const &debugConfig)#
-
- -
-
-static CacheTransceiverConfig deserializeCacheTransceiverConfig( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
CacheTransceiverConfig const &cacheTransceiverConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
CacheTransceiverConfig const &cacheTransceiverConfig,
-
- -)#
-
- -
-
-static ExecutorConfig deserializeExecutorConfig(std::istream &is)#
-
- -
-
-static void serialize( - -
-
ExecutorConfig const &executorConfig,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(ExecutorConfig const &executorConfig)#
-
- -
-
-static KvCacheStats deserializeKvCacheStats(std::istream &is)#
-
- -
-
-static void serialize( - -
-
KvCacheStats const &kvCacheStats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(KvCacheStats const &kvCacheStats)#
-
- -
-
-static StaticBatchingStats deserializeStaticBatchingStats( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
StaticBatchingStats const &staticBatchingStats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
StaticBatchingStats const &staticBatchingStats,
-
- -)#
-
- -
-
-static InflightBatchingStats deserializeInflightBatchingStats( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
InflightBatchingStats const &inflightBatchingStats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
InflightBatchingStats const &inflightBatchingStats,
-
- -)#
-
- -
-
-static SpecDecodingStats deserializeSpecDecodingStats( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
SpecDecodingStats const &specDecodingStats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
SpecDecodingStats const &specDecodingStats,
-
- -)#
-
- -
-
-static IterationStats deserializeIterationStats( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static IterationStats deserializeIterationStats(std::istream &is)#
-
- -
-
-static void serialize( - -
-
IterationStats const &iterStats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static std::vector<char> serialize(IterationStats const &iterStats)#
-
- -
-
-static size_t serializedSize(IterationStats const &iterStats)#
-
- -
-
-static std::vector<char> serialize( - -
-
std::vector<IterationStats> const &iterStatsVec,
-
- -)#
-
- -
-
-static std::vector<IterationStats> deserializeIterationStatsVec( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static DisServingRequestStats deserializeDisServingRequestStats( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static void serialize( - -
-
DisServingRequestStats const &stats,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
DisServingRequestStats const &disServingRequestStats,
-
- -)#
-
- -
-
-static RequestStage deserializeRequestStage(std::istream &is)#
-
- -
-
-static void serialize( - -
-
RequestStage const &requestStage,
-
std::ostream &os,
-
- -)#
-
- -
-
-static size_t serializedSize(RequestStage const &requestStage)#
-
- -
-
-static RequestStats deserializeRequestStats(std::istream &is)#
-
- -
-
-static void serialize(RequestStats const &state, std::ostream &os)#
-
- -
-
-static size_t serializedSize(RequestStats const &state)#
-
- -
-
-static RequestStatsPerIteration deserializeRequestStatsPerIteration( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static RequestStatsPerIteration deserializeRequestStatsPerIteration( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static void serialize( - -
-
RequestStatsPerIteration const &state,
-
std::ostream &os,
-
- -)#
-
- -
-
-static std::vector<char> serialize( - -
-
RequestStatsPerIteration const &state,
-
- -)#
-
- -
-
-static size_t serializedSize(RequestStatsPerIteration const &state)#
-
- -
-
-static std::vector<char> serialize( - -
-
std::vector<RequestStatsPerIteration> const &requestStatsVec,
-
- -)#
-
- -
-
-static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec( - -
-
std::vector<char> &buffer,
-
- -)#
-
- -
-
-static std::vector<char> serialize( - -
-
std::deque<KVCacheEvent> const &kvCacheEvents,
-
- -)#
-
- -
-
-static std::deque<KVCacheEvent> deserializeKVCacheEvents( - -
-
std::vector<char> &buffer,
-
- -)#
+
+virtual ~Connection() = default#
-
-static size_t serializedSize(KVCacheEvent const &event)#
-
- -
-
-static void serialize(KVCacheEvent const &event, std::ostream &os)#
-
- -
-
-static KVCacheEvent deserializeKVCacheEvent(std::istream &is)#
-
- -
-
-static size_t serializedSize(KVCacheCreatedData const &data)#
-
- -
-
-static void serialize( - -
-
KVCacheCreatedData const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-static KVCacheCreatedData deserializeKVCacheCreatedData( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static size_t serializedSize(KVCacheStoredData const &data)#
-
- -
-
-static void serialize( - -
-
KVCacheStoredData const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-static KVCacheStoredData deserializeKVCacheStoredData( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static size_t serializedSize(KVCacheStoredBlockData const &data)#
-
- -
-
-static void serialize( - -
-
KVCacheStoredBlockData const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-static KVCacheStoredBlockData deserializeKVCacheStoredBlockData( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static size_t serializedSize(KVCacheRemovedData const &data)#
-
- -
-
-static void serialize( - -
-
KVCacheRemovedData const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-static KVCacheRemovedData deserializeKVCacheRemovedData( - -
-
std::istream &is,
-
- -)#
-
- -
-
-template<typename T>
static size_t serializedSize( - -
-
KVCacheEventDiff<T> const &data,
-
- -)#
-
- -
-
-template<typename T>
static void serialize( - -
-
KVCacheEventDiff<T> const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-template<typename T>
static KVCacheEventDiff<T> deserializeKVCacheEventDiff( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static size_t serializedSize(KVCacheUpdatedData const &data)#
-
- -
-
-static void serialize( - -
-
KVCacheUpdatedData const &data,
-
std::ostream &os,
-
- -)#
-
- -
-
-static KVCacheUpdatedData deserializeKVCacheUpdatedData( - -
-
std::istream &is,
-
- -)#
-
- -
-
-static size_t serializedSize( - -
-
tensorrt_llm::runtime::UniqueToken const &token,
-
- -)#
-
- -
-
-static void serialize( +
+virtual void send(
-
tensorrt_llm::runtime::UniqueToken const &token,
-
std::ostream &os,
+
DataContext const &ctx,
+
void const *data,
+
size_t size,
-)#
+) const = 0#
-
-static tensorrt_llm::runtime::UniqueToken deserializeUniqueToken( +
+virtual void recv(
-
std::istream &is,
+
DataContext const &ctx,
+
void *data,
+
size_t size,
-)#
-
- -
-
-static std::string deserializeString(std::istream &is)#
-
- -
-
-static bool deserializeBool(std::istream &is)#
+) const = 0#
-
-static ModelType deserializeModelType(std::istream &is)#
+
+inline virtual bool isThreadSafe() const noexcept#
-
-
-namespace kv_cache
+
+
+class ConnectionManager#
+
+

Public Functions

+
+
+virtual ~ConnectionManager() = default#
+
+
+virtual Connection const *recvConnect( + +
+
DataContext const &ctx,
+
void *data,
+
size_t size,
+
+ +) = 0#
+
+ +
+
+virtual std::vector<Connection const*> getConnections( + +
+
CommState const &state,
+
+ +) = 0#
+
+ +
+
+virtual CommState const &getCommState() const = 0#
+
+ +
+
+ +
+
+struct DataContext#
+
+

Public Functions

+
+
+inline explicit DataContext(int tag)#
+
+ +
+
+inline int getTag() const noexcept#
+
+ +
+
+

Private Members

+
+
+int const mTag#
+
+ +
+
+ +
+
@@ -12031,134 +12100,33 @@
+
+

bufferManager.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class BufferManager#
+
+#include <bufferManager.h>
+

A helper class for managing memory on host and device.

+
+

Public Types

+
+
+using IBufferPtr = IBuffer::UniquePtr#
+
+ +
+
+using ITensorPtr = ITensor::UniquePtr#
+
+ +
+
+using CudaStreamPtr = std::shared_ptr<CudaStream>#
+
+ +
+
+using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>#
+
+ +
+
+

Public Functions

+
+
+explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)#
+

Construct a BufferManager.

+
+
Parameters:
+

cudaStream[in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).

+
+
+
+ +
+
+inline ~BufferManager()#
+

Destructor.

+
+ +
+
+IBufferPtr gpu( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.

+
+ +
+
+ITensorPtr gpu( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.

+
+ +
+
+IBufferPtr allocate( + +
+
MemoryType memoryType,
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Allocates an IBuffer of the given size and memory type.

+
+ +
+
+ITensorPtr allocate( + +
+
MemoryType memoryType,
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Allocates an ITensor of the given dimensions and memory type.

+
+ +
+
+inline IBufferPtr emptyBuffer( + +
+
MemoryType memoryType,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Create an empty IBuffer of the given memory type. It may be resized later.

+
+ +
+
+inline ITensorPtr emptyTensor( + +
+
MemoryType memoryType,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +) const#
+

Create an empty ITensor of the given memory type. It may be reshaped later.

+
+ +
+
+void setMem(IBuffer &buffer, int32_t value) const#
+

Set the contents of the given buffer to value.

+
+ +
+
+void setZero(IBuffer &buffer) const#
+

Set the contents of the given buffer to zero.

+
+ +
+
+void copy(void const *src, IBuffer &dst, MemoryType srcType) const#
+

Copy src to dst.

+
+ +
+
+void copy(IBuffer const &src, void *dst, MemoryType dstType) const#
+

Copy src to dst.

+
+ +
+
+inline void copy(void const *src, IBuffer &dst) const#
+

Copy src to dst.

+
+ +
+
+inline void copy(IBuffer const &src, void *dst) const#
+

Copy src to dst.

+
+ +
+
+void copy(IBuffer const &src, IBuffer &dst) const#
+

Copy src to dst.

+
+ +
+
+IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const#
+

Copy src into a new IBuffer with a potentially different memory type.

+
+ +
+
+ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const#
+

Copy src into a new ITensor with a potentially different memory type.

+
+ +
+
+template<typename T>
inline IBufferPtr copyFrom( + +
+
std::vector<T> const &src,
+
MemoryType memoryType,
+
+ +) const#
+

Copy src into a new IBuffer with a potentially different memory type.

+
+ +
+
+template<typename T>
inline ITensorPtr copyFrom( + +
+
T *src,
+
nvinfer1::Dims dims,
+
MemoryType memoryType,
+
+ +) const#
+

Copy src into a new ITensor with a potentially different memory type.

+
+ +
+
+template<typename T>
inline ITensorPtr copyFrom( + +
+
std::vector<T> const &src,
+
nvinfer1::Dims dims,
+
MemoryType memoryType,
+
+ +) const#
+

Copy src into a new ITensor with a potentially different memory type.

+
+ +
+
+CudaStream const &getStream() const#
+

Get the underlying cuda stream.

+
+ +
+
+std::size_t memoryPoolReserved() const#
+

The current size of the memory reserved by the memory pool.

+
+ +
+
+std::size_t memoryPoolUsed() const#
+

The current size of the memory used by the memory pool.

+
+ +
+
+std::size_t memoryPoolFree() const#
+

The current size of the memory free in the memory pool.

+
+ +
+
+void memoryPoolTrimTo(std::size_t size)#
+

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.

+
+ +
+
+

Public Static Functions

+
+
+static IBufferPtr gpuSync( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an IBuffer of the given size on the GPU, using cudaMalloc.

+
+ +
+
+static ITensorPtr gpuSync( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.

+
+ +
+
+static IBufferPtr cpu( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an IBuffer of the given size on the CPU.

+
+ +
+
+static ITensorPtr cpu( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an ITensor of the given dimensions on the CPU.

+
+ +
+
+static IBufferPtr pinned( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates a pinned IBuffer of the given size on the CPU.

+
+ +
+
+static ITensorPtr pinned( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates a pinned ITensor of the given dimensions on the CPU.

+
+ +
+
+static IBufferPtr pinnedPool( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

+
+ +
+
+static ITensorPtr pinnedPool( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

+
+ +
+
+static IBufferPtr managed( + +
+
std::size_t size,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an IBuffer of the given size in UVM.

+
+ +
+
+static ITensorPtr managed( + +
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type = kBYTE_TYPE,
+
+ +)#
+

Allocates an ITensor of the given dimensions in UVM.

+
+ +
+
+static ITensorPtr ipcNvls( + +
+
std::set<int> ranks,
+
nvinfer1::Dims dims,
+
nvinfer1::DataType type,
+
+ +)#
+

Allocates an ITensor of the given dimensions for NVLS.

+
+ +
+
+

Public Static Attributes

+
+
+static auto constexpr kBYTE_TYPE = nvinfer1::DataType::kUINT8#
+
+ +
+
+

Private Members

+
+
+CudaStreamPtr mStream#
+
+ +
+
+CudaMemPoolPtr mPool#
+
+ +
+
+bool const mTrimPool#
+
+ +
+
+

Friends

+
+
+friend class ::BufferManagerTest
+
+ +
+
+ +
+ +
+ +
+
+

gptJsonConfig.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class GptJsonConfig#
+
+

Public Functions

+
+
+inline GptJsonConfig( + +
+
std::string name,
+
std::string version,
+
std::string precision,
+
SizeType32 tensorParallelism,
+
SizeType32 pipelineParallelism,
+
SizeType32 contextParallelism,
+
SizeType32 gpusPerNode,
+
ModelConfig modelConfig,
+
std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt,
+
+ +)#
+
+ +
+
+inline ModelConfig const &getModelConfig() const#
+
+ +
+
+inline ModelConfig &getModelConfigMutable()#
+
+ +
+
+inline std::string const &getName() const#
+
+ +
+
+inline std::string const &getVersion() const#
+
+ +
+
+inline std::string const &getPrecision() const#
+
+ +
+
+inline SizeType32 constexpr getTensorParallelism() const#
+
+ +
+
+inline SizeType32 constexpr getPipelineParallelism() const#
+
+ +
+
+inline SizeType32 constexpr getContextParallelism() const#
+
+ +
+
+inline SizeType32 constexpr getGpusPerNode() const#
+
+ +
+
+inline SizeType32 constexpr getWorldSize() const#
+
+ +
+
+inline std::optional<RuntimeDefaults> getRuntimeDefaults() const#
+
+ +
+
+std::string engineFilename( + +
+
WorldConfig const &worldConfig,
+
std::string const &model,
+
+ +) const#
+
+ +
+
+inline std::string engineFilename( + +
+
WorldConfig const &worldConfig,
+
+ +) const#
+
+ +
+
+

Public Static Functions

+
+
+static GptJsonConfig parse(std::string const &json)#
+
+ +
+
+static GptJsonConfig parse(std::istream &json)#
+
+ +
+
+static GptJsonConfig parse(std::filesystem::path const &path)#
+
+ +
+
+

Private Members

+
+
+std::string const mName#
+
+ +
+
+std::string const mVersion#
+
+ +
+
+std::string const mPrecision#
+
+ +
+
+SizeType32 const mTensorParallelism#
+
+ +
+
+SizeType32 const mPipelineParallelism#
+
+ +
+
+SizeType32 const mContextParallelism#
+
+ +
+
+SizeType32 const mGpusPerNode#
+
+ +
+
+ModelConfig mModelConfig#
+
+ +
+
+std::optional<RuntimeDefaults> mRuntimeDefaults#
+
+ +
+
+ +
+ +
+ +
+
+

runtimeDefaults.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+struct RuntimeDefaults#
+
+

Public Functions

+
+
+inline RuntimeDefaults( + +
+
std::optional<std::vector<SizeType32>> maxAttentionWindowVec,
+
std::optional<SizeType32> sinkTokenLength,
+
+ +)#
+
+ +
+
+RuntimeDefaults() = default#
+
+ +
+
+

Public Members

+
+
+std::optional<std::vector<SizeType32>> maxAttentionWindowVec#
+
+ +
+
+std::optional<SizeType32> sinkTokenLength#
+
+ +
@@ -5795,6 +5438,925 @@ one more than decoding draft tokens for prediction from primary head

+
+
+

rawEngine.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class RawEngine#
+
+

Public Types

+
+
+enum Type#
+

Values:

+
+
+enumerator FilePath#
+
+ +
+
+enumerator AddressWithSize#
+
+ +
+
+enumerator HostMemory#
+
+ +
+ +
+
+

Public Functions

+
+
+inline explicit RawEngine(std::filesystem::path enginePath) noexcept#
+
+ +
+
+inline explicit RawEngine( + +
+
void const *engineAddr,
+
std::size_t engineSize,
+
+ +) noexcept#
+
+ +
+
+inline explicit RawEngine( + +
+
nvinfer1::IHostMemory const *engineBuffer,
+
+ +) noexcept#
+
+ +
+
+inline Type getType() const#
+
+ +
+
+inline std::filesystem::path getPath() const#
+
+ +
+
+inline std::optional<std::filesystem::path> getPathOpt() const#
+
+ +
+
+inline void setPath(std::filesystem::path enginePath)#
+
+ +
+
+inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt( + +
+
+ +) const#
+
+ +
+
+inline void setManagedWeightsMap( + +
+
std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap,
+
+ +)#
+
+ +
+
+inline void const *getAddress() const#
+
+ +
+
+inline std::size_t getSize() const#
+
+ +
+
+inline nvinfer1::IHostMemory const *getHostMemory() const#
+
+ +
+
+

Public Members

+
+
+void const *mEngineAddr = {}#
+
+ +
+
+std::size_t mEngineSize = {}#
+
+ +
+
+

Private Members

+
+
+Type mType#
+
+ +
+
+std::optional<std::filesystem::path> mEnginePath#
+
+ +
+
+struct tensorrt_llm::runtime::RawEngine
+
+ +
+
+nvinfer1::IHostMemory const *mEngineBuffer = {}#
+
+ +
+
+std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap#
+
+ +
+
+ +
+ +
+ +
+
+

gptDecoder.h#

+
+
+namespace tensorrt_llm
+
+
+namespace layers#
+
+ +
+
+namespace runtime
+
+

Functions

+
+
+inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots( + +
+
runtime::SizeType32 batchSize,
+
+ +)#
+

Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.

+
+ +
+
+
+template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder#
+
+

Public Types

+
+
+using CudaStreamPtr = BufferManager::CudaStreamPtr#
+
+ +
+
+using TensorPtr = std::shared_ptr<ITensor>#
+
+ +
+
+

Public Functions

+
+
+GptDecoder( + +
+
executor::DecodingMode const &mode,
+
size_t maxNumSequences,
+
size_t maxBeamWidth,
+
size_t vocabSize,
+
size_t vocabSizePadded,
+
CudaStreamPtr const &stream,
+
std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr,
+
+ +)#
+
+ +
+
+virtual void setup( + +
+
SamplingConfig const &samplingConfig,
+
size_t batchSize,
+
TensorConstPtr const &batchSlots,
+
std::optional<DecodingOutput> const &output = std::nullopt,
+
std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+
std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
+
std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
+
+ +) override#
+
+
Parameters:
+

explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around (WAR) the lack of a bf16 decoder.

+
+
+
+ +
+
+virtual void forwardAsync( + +
+
DecodingOutput &output,
+
DecodingInput const &input,
+
+ +) override#
+
+ +
+
+virtual void forwardSync( + +
+
DecodingOutput &output,
+
DecodingInput const &input,
+
+ +) override#
+
+ +
+
+inline virtual SamplingConfig const &getSamplingConfig() override#
+
+ +
+
+virtual void disableLookahead( + +
+
std::optional<SamplingConfig> const &samplingConfig,
+
SizeType32 batchSize,
+
TensorConstPtr batchSlots,
+
+ +) override#
+
+ +
+
+

Private Members

+
+
+std::shared_ptr<BufferManager> mManager#
+
+ +
+
+std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer#
+
+ +
+
+std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace#
+
+ +
+
+SamplingConfig mSamplingConfig#
+
+ +
+
+size_t mMaxNumSequences#
+
+ +
+
+size_t mVocabSize#
+
+ +
+
+size_t mVocabSizePadded#
+
+ +
+
+executor::DecodingMode mDecodingMode#
+
+ +
+
+ +
+
+class IGptDecoder#
+

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

+
+

Public Types

+
+
+using TensorPtr = runtime::ITensor::SharedPtr#
+
+ +
+
+using TensorConstPtr = runtime::ITensor::SharedConstPtr#
+
+ +
+
+

Public Functions

+
+
+virtual ~IGptDecoder() = default#
+
+ +
+
+virtual void setup( + +
+
SamplingConfig const &samplingConfig,
+
size_t batchSize,
+
TensorConstPtr const &batchSlots,
+
std::optional<DecodingOutput> const &output = std::nullopt,
+
std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+
std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
+
std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
+
+ +) = 0#
+
+
Parameters:
+

explicitDraftTokensDType – is only used by the ExplicitDraftTokens model to work around (WAR) the lack of a bf16 decoder.

+
+
+
+ +
+
+virtual void forwardAsync( + +
+
DecodingOutput &output,
+
DecodingInput const &input,
+
+ +) = 0#
+
+ +
+
+virtual void forwardSync( + +
+
DecodingOutput &output,
+
DecodingInput const &input,
+
+ +) = 0#
+
+ +
+
+virtual SamplingConfig const &getSamplingConfig() = 0#
+
+ +
+
+virtual void disableLookahead( + +
+
std::optional<SamplingConfig> const &samplingConfig,
+
SizeType32 batchSize,
+
TensorConstPtr batchSlots,
+
+ +) = 0#
+
+ +
+
+

Public Static Functions

+
+
+static inline std::unique_ptr<IGptDecoder> create( + +
+
executor::DecodingMode const &mode,
+
nvinfer1::DataType dtype,
+
size_t maxNumSequences,
+
size_t maxBeamWidth,
+
size_t vocabSize,
+
size_t vocabSizePadded,
+
BufferManager::CudaStreamPtr const &stream,
+
std::shared_ptr<SpeculativeDecodingModule const> const &speculativeDecodingModule = nullptr,
+
+ +)#
+
+ +
+
+ +
+ +
+ +
+
+

eagleBuffers.h#

+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+ +
+
+namespace runtime
+
+
+class EagleBuffers#
+
+

Public Types

+
+
+using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
+
+ +
+
+using RequestVector = std::vector<LlmRequestPtr>#
+
+ +
+
+using SizeType32 = runtime::SizeType32#
+
+ +
+
+using ITensor = runtime::ITensor#
+
+ +
+
+using BufferPtr = runtime::IBuffer::SharedPtr#
+
+ +
+
+using TensorPtr = runtime::ITensor::SharedPtr#
+
+ +
+
+using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
+
+ +
+
+

Public Functions

+
+
+EagleBuffers( + +
+
SizeType32 maxBatchSize,
+
SizeType32 maxBeamWidth,
+
runtime::BufferManager const &manager,
+
runtime::ModelConfig const &modelConfig,
+
runtime::WorldConfig const &worldConfig,
+
executor::DecodingConfig const &decodingConfig,
+
+ +)#
+
+ +
+
+void reshape( + +
+
SizeType32 numCtxSequences,
+
SizeType32 numGenSequences,
+
runtime::ModelConfig const &modelConfig,
+
+ +)#
+
+ +
+
+void setFromInputs( + +
+
RequestVector const &contextRequests,
+
RequestVector const &genRequests,
+
runtime::ITensor const &requestTypes,
+
ITensor const &seqSlots,
+
EagleBuffers::Inputs const &decoderBuffers,
+
runtime::BufferManager const &manager,
+
runtime::ModelConfig const &modelConfig,
+
runtime::WorldConfig const &worldConfig,
+
+ +) const#
+
+ +
+
+void insertInputTensors( + +
+
TensorMap &inputBuffers,
+
TensorMap &outputBuffers,
+
runtime::WorldConfig const &worldConfig,
+
+ +) const#
+
+ +
+
+

Public Members

+
+
+Inputs engineInputs#
+
+ +
+
+class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs#
+
+ +
+
+

Private Functions

+
+
+template<typename T>
void setFromInputs( + +
+
RequestVector const &contextRequests,
+
RequestVector const &genRequests,
+
SizeType32 vocabSizePadded,
+
ITensor const &seqSlots,
+
EagleBuffers::Inputs const &draftBuffers,
+
runtime::EagleModule const &eagleModule,
+
runtime::BufferManager const &manager,
+
+ +) const#
+
+ +
+
+

Private Members

+
+
+std::size_t scanReduceTempStorageBytes = {0}#
+
+ +
+
+float mDefaultPosteriorThreshold = {0.09f}#
+
+ +
+
+bool mDoGreedySampling = {true}#
+
+ +
+
+BufferPtr scanReduceTempStorage#
+
+ +
+
+TensorPtr cumSumGenerationLengths#
+
+ +
+
+TensorPtr maxGenerationLength#
+
+ +
+
+TensorPtr chunkedContextNextTokensHost#
+
+ +
+
+TensorPtr greedySamplingHost#
+
+ +
+
+TensorPtr posteriorAlphaHost#
+
+ +
+
+TensorPtr posteriorThresholdHost#
+
+ +
+
+
+class EngineOutputs#
+
+

Public Members

+
+
+TensorPtr nextDraftTokens#
+

[batchSize, maxDecodingDraftTokens]

+
+ +
+
+TensorPtr nextDraftLens#
+

[batchSize]

+
+ +
+
+TensorPtr nextDraftPaths#
+

[batchSize, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr acceptedTokens#
+

[batchSize, maxPathLen]

+
+ +
+
+TensorPtr acceptedLens#
+

[batchSize]

+
+ +
+
+TensorPtr acceptedPaths#
+

[batchSize]

+
+ +
+
+TensorPtr chunkedContextNextTokens#
+

[batchSize]

+
+ +
+
+ +
+
+class Inputs#
+
+

Public Functions

+
+
+void create( + +
+
SizeType32 maxNumSequences,
+
BufferManager const &manager,
+
ModelConfig const &modelConfig,
+
WorldConfig const &worldConfig,
+
+ +)#
+
+ +
+
+

Public Members

+
+
+TensorPtr temperatures#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr posteriorAlpha#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr posteriorThreshold#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr randomDataSample#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr randomDataValidation#
+

[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]

+
+ +
+
+TensorPtr draftTokens#
+

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

+
+ +
+
+TensorPtr draftLens#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr draftPaths#
+

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr draftPathsHost#
+

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr specDecodingGenerationLengths#
+

[maxBatchSize] or [numGenSequences]

+
+ +
+
+TensorPtr specDecodingGenerationLengthsHost#
+

[maxBatchSize] or [numGenSequences]

+
+ +
+
+TensorPtr specDecodingPackedMasks#
+

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

+
+ +
+
+TensorPtr specDecodingPositionOffsets#
+

[maxBatchSize] or [numGenSequences]

+
+ +
+
+TensorPtr eagleNetCtxRequestTypesHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr eagleNetCtxContextLengthsHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr eagleNetCtxPastKeyValueLengthsHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr eagleNetGenRequestTypesHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr eagleNetGenContextLengthsHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr eagleNetGenPastKeyValueLengthsHost#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr inputGenTokensHost#
+

[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]

+
+ +
+
+TensorPtr chunkedContextNextTokens#
+

[maxBatchSize] or [numSequences]

+
+ +
+
+TensorPtr useSpecDecoding#
+

[1]

+
+ +
+
+TensorPtr useDynamicTreeHost#
+

[1]

+
+ +
+
+TensorPtr dynamicTreeMaxTopKHost#
+

[1]

+
+ +
+
+TensorPtr prevScores#
+

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

+
+ +
+
+TensorPtr currentExpandIndices#
+

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

+
+ +
+
+TensorPtr allLayersScores#
+

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

+
+ +
+
+TensorPtr allLayersDraftTokenIds#
+

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

+
+ +
+
+TensorPtr allLayersDraftTokenIdsPredecessor#
+

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

+
+ +
+
+ +
+ +
+ +
+

medusaModule.h#

@@ -5859,6 +6421,5223 @@ one more than decoding draft tokens for prediction from primary head

+
+
+

virtualMemory.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Functions

+
+
+CudaVirtualMemoryManager &getVirtualMemoryManager()#
+
+ +
+
+CudaVirtualMemoryAllocator getVirtualMemoryAllocator()#
+
+ +
+
+void setVirtualMemoryAllocator( + +
+
std::string const &tag,
+
CudaVirtualMemoryAllocator::RestoreMode mode,
+
std::shared_ptr<CudaStream> backStream,
+
+ +)#
+
+ +
+
+void clearVirtualMemoryAllocator()#
+
+ +
+
+
+class CudaVirtualMemoryAllocator#
+
+

Public Types

+
+
+enum RestoreMode#
+

Values:

+
+
+enumerator NONE#
+
+ +
+
+enumerator MEMSET#
+
+ +
+
+enumerator CPU#
+
+ +
+
+enumerator PINNED#
+
+ +
+ +
+
+

Public Functions

+
+
+inline explicit CudaVirtualMemoryAllocator( + +
+
std::shared_ptr<Configuration> config,
+
+ +)#
+
+ +
+
+inline explicit operator bool() const noexcept#
+
+ +
+
+void allocate(Pointer *ptr, std::size_t n, int device) const#
+
+ +
+
+void deallocate(Pointer ptr, std::size_t n) const#
+
+ +
+
+

Private Types

+
+
+using CudaStreamPtr = std::shared_ptr<CudaStream>#
+
+ +
+
+using Pointer = void*#
+
+ +
+
+

Private Members

+
+
+std::shared_ptr<Configuration> mConfig#
+
+ +
+
+
+class Configuration#
+
+

Public Functions

+
+
+inline Configuration( + +
+
CudaVirtualMemoryManager &manager,
+
std::string tag,
+
RestoreMode mode,
+
CudaStreamPtr backStream,
+
+ +)#
+

CudaVirtualMemoryAllocator::Configuration

+
+
Parameters:
+
    +
  • manager – Manager used to track and manage virtual memories

  • +
  • tag – The tag for allocated memories

  • +
  • mode – Backed storage mode

  • +
  • backStream – The CUDA stream used for restoring memory content. Note: Virtual address allocation is not asynchronous, so the stream is not used in allocation.

  • +
+
+
+
+ +
+
+inline std::size_t pageAligned(std::size_t n) const noexcept#
+
+ +
+
+

Public Static Attributes

+
+
+static Configuration backgroundConfiguration#
+
+ +
+
+

Private Functions

+
+
+inline Configuration( + +
+
CudaVirtualMemoryManager &manager,
+
std::string tag,
+
RestoreMode mode,
+
CudaStreamPtr backStream,
+
bool background,
+
+ +)#
+
+ +
+
+

Private Members

+
+
+CudaVirtualMemoryManager &mManager#
+
+ +
+
+std::string mTag#
+
+ +
+
+CudaStreamPtr mBackStream#
+
+ +
+
+std::size_t mPageSize#
+
+ +
+
+RestoreMode mMode#
+
+ +
+
+bool mBackground = {}#
+
+ +
+
+

Friends

+
+
+friend class CudaVirtualMemoryAllocator
+
+ +
+
+friend void setVirtualMemoryAllocator( + +
+
std::string const &tag,
+
RestoreMode mode,
+
std::shared_ptr<CudaStream> backStream,
+
+ +)#
+
+ +
+
+ +
+ +
+
+class CUDAVirtualMemoryChunk#
+
+#include <virtualMemory.h>
+

CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, providing the ability to release and rematerialize the allocation.

+
+

Public Types

+
+
+enum Status#
+

Values:

+
+
+enumerator INVALID#
+
+ +
+
+enumerator RELEASED#
+
+ +
+
+enumerator MATERIALIZED#
+
+ +
+
+enumerator ERRORED#
+
+ +
+ +
+
+using CreatorPtr = std::unique_ptr<Creator>#
+
+ +
+
+using ConfiguratorPtr = std::unique_ptr<Configurator>#
+
+ +
+
+using Configurators = std::vector<ConfiguratorPtr>#
+
+ +
+
+

Public Functions

+
+
+inline Status status() const noexcept#
+
+ +
+
+void materialize()#
+

Materialize this CUDAVirtualMemoryChunk. Shall be called only when status() == RELEASED.

+

Calls creator.create(), and then configurator.setup() for each configurator in order.

+

Stops at the first thrown exception and propagates it.

+
+ +
+
+inline void release()#
+

Release this CUDAVirtualMemoryChunk. Shall be called only when status() == MATERIALIZED, or materialize() throws. Will be called automatically by destructor if necessary.

+

Calls configurator.teardown(), in reverse order, for each configurator whose setup() succeeded in materialize(), and then creator.release().

+

Never stops early upon exception. The last thrown exception will be propagated, and others logged.

+
+ +
+
+CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete#
+
+ +
+
+CUDAVirtualMemoryChunk &operator=( + +
+
CUDAVirtualMemoryChunk const&,
+
+ +) = delete#
+
+ +
+
+inline CUDAVirtualMemoryChunk( + +
+
CUDAVirtualMemoryChunk &&other,
+
+ +) noexcept#
+
+ +
+
+inline CUDAVirtualMemoryChunk &operator=( + +
+
CUDAVirtualMemoryChunk &&other,
+
+ +)#
+
+ +
+
+CUDAVirtualMemoryChunk() noexcept = default#
+
+ +
+
+inline CUDAVirtualMemoryChunk( + +
+
CreatorPtr &&creator,
+
Configurators &&configurators,
+
+ +)#
+
+ +
+
+inline virtual ~CUDAVirtualMemoryChunk()#
+
+ +
+
+inline explicit operator bool() const noexcept#
+

Test if this CUDAVirtualMemoryChunk is managing a memory block.

+
+ +
+
+

Private Functions

+
+
+void _release(bool destructing)#
+
+ +
+
+

Private Members

+
+
+size_t mState = 0#
+
+ +
+
+CUmemGenericAllocationHandle mHandle = {}#
+
+ +
+
+std::unique_ptr<Creator> mCreator#
+
+ +
+
+std::vector<std::unique_ptr<Configurator>> mConfigurators#
+
+ +
+
+

Private Static Attributes

+
+
+static constexpr size_t INVALID_STATE = static_cast<size_t>(-1)#
+
+ +
+
+
+struct Configurator#
+
+#include <virtualMemory.h>
+

CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle:

    +
  • Map into virtual address

  • +
  • Bind to multicast object

  • +
  • Backup and restore memory content

  • +
+

+

Subclassed by tensorrt_llm::runtime::MemsetConfigurator, tensorrt_llm::runtime::MulticastConfigurator, tensorrt_llm::runtime::OffloadConfigurator, tensorrt_llm::runtime::UnicastConfigurator

+
+

Public Functions

+
+
+Configurator() = default#
+
+ +
+
+virtual ~Configurator() = default#
+
+ +
+
+Configurator(Configurator const&) = default#
+
+ +
+
+Configurator &operator=(Configurator const&) = default#
+
+ +
+
+Configurator(Configurator&&) = default#
+
+ +
+
+Configurator &operator=(Configurator&&) = default#
+
+ +
+
+virtual void setup(CUmemGenericAllocationHandle handle) = 0#
+
+ +
+
+virtual void teardown( + +
+
CUmemGenericAllocationHandle handle,
+
bool destructing,
+
+ +) = 0#
+
+ +
+
+ +
+
+struct Creator#
+
+#include <virtualMemory.h>
+

CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, either by creating one locally, or importing one from remote.

+

Subclassed by tensorrt_llm::runtime::LocalCreator< count >

+
+

Public Functions

+
+
+Creator() = default#
+
+ +
+
+virtual ~Creator() = default#
+
+ +
+
+Creator(Creator const&) = default#
+
+ +
+
+Creator &operator=(Creator const&) = default#
+
+ +
+
+Creator(Creator&&) = default#
+
+ +
+
+Creator &operator=(Creator&&) = default#
+
+ +
+
+virtual CUmemGenericAllocationHandle create() = 0#
+
+ +
+
+virtual void release( + +
+
CUmemGenericAllocationHandle handle,
+
bool destructing,
+
+ +) = 0#
+
+ +
+
+ +
+ +
+
+class CudaVirtualMemoryManager#
+
+

Public Functions

+
+
+void add( + +
+
uintptr_t handle,
+
std::string tag,
+
CUDAVirtualMemoryChunk &&memory,
+
+ +)#
+

Add memory to be managed by this manager.

+

The memory and internal state will remain valid if any exception is thrown.

+

+
+
Parameters:
+
    +
  • handle – Unique handle provided to reference this memory in remove.

  • +
  • tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.

  • +
  • memory – The CUDAVirtualMemoryChunk object.

  • +
+
+
+
+ +
+
+void add( + +
+
uintptr_t handle,
+
std::string tag,
+
CUDAVirtualMemoryChunk::CreatorPtr &&creator,
+
CUDAVirtualMemoryChunk::Configurators &&configurators,
+
+ +)#
+

Creates and adds memory to be managed by this manager. The created memory is automatically materialized.

+

The internal state will remain valid if any exception is thrown.

+

+
+
Parameters:
+
    +
  • handle – Unique handle provided to reference this memory in remove.

  • +
  • tag – Tag the memory, so this memory can be targeted in releaseWithTag and materializeWithTag.

  • +
  • creator – The creator for the memory.

  • +
  • configurators – The configurators for the memory.

  • +
+
+
+
+ +
+
+template<typename ...Configurators>
inline void add( + +
+
uintptr_t handle,
+
std::string tag,
+
CUDAVirtualMemoryChunk::CreatorPtr &&creator,
+
Configurators&&... configurators,
+
+ +)#
+
+ +
+
+CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept#
+

Remove the memory from the manager.

+
+
Parameters:
+

handle – The handle provided to add.

+
+
Returns:
+

The CUDAVirtualMemoryChunk object. If the handle is unknown, an empty CUDAVirtualMemoryChunk will be returned.

+
+
+
+ +
+
+size_t releaseWithTag(std::string const &tag)#
+

Call release for CUDAVirtualMemoryChunk

objects with a given tag.

+

This function will always call

+CUDAVirtualMemoryChunk::release on all selected objects. The last exception thrown by CUDAVirtualMemoryChunk::release will be rethrown, and others will be logged.

+

If any CUDAVirtualMemoryChunk threw an exception during release, it will be removed from the manager. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.

+
+
Parameters:
+

tag – the tag to select target memories.

+
+
Returns:
+

Number of objects selected.

+
+
+
+ +
+
+size_t materializeWithTag(std::string const &tag)#
+

Call materialize for CUDAVirtualMemoryChunk

objects with a given tag.

+

This function will stop at the first

+CUDAVirtualMemoryChunk::materialize that throws exception, and attempt to roll back previous successful materialize by calling release. The exception thrown by CUDAVirtualMemoryChunk::materialize will be rethrown, and any exception thrown by release will be logged.

+

If any CUDAVirtualMemoryChunk threw an exception during materialize or release, it will be removed from the manager. Successfully rolled-back CUDAVirtualMemoryChunk objects will not be removed. Call retrieveBadHandles to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.

+
+
Parameters:
+

tag – the tag to select target memories.

+
+
Returns:
+

Number of objects selected.

+
+
+
+ +
+
+std::vector<uintptr_t> retrieveBadHandles() noexcept#
+

Retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception and reset the list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened. This method is only for diagnostic purposes, and should not be called concurrently with other methods.

+
+
Returns:
+

The handle list.

+
+
+
+ +
+
+

Private Types

+
+
+using PointerMemoryMap = std::map<uintptr_t, Entry>#
+
+ +
+
+using TagEntryMap = std::multimap<std::string, PointerMemoryMap::iterator>#
+
+ +
+
+

Private Functions

+
+
+CUDAVirtualMemoryChunk unsafeRemove(uintptr_t handle) noexcept#
+
+ +
+
+void addBadHandle(uintptr_t handle) noexcept#
+
+ +
+
+

Private Members

+
+
+std::mutex mMutex#
+
+ +
+
+PointerMemoryMap mMemories#
+
+ +
+
+TagEntryMap mEntries#
+
+ +
+
+std::vector<uintptr_t> mBadHandles#
+
+ +
+
+friend VirtualMemoryManagerTest
+
+ +
+
+
+struct Entry#
+
+

Public Members

+
+
+CUDAVirtualMemoryChunk mMemory#
+
+ +
+
+TagEntryMap::iterator mEntryIt#
+
+ +
+
+ +
+ +
+
+template<bool count = true>
struct LocalCreator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Creator#
+
+#include <virtualMemory.h>
+

LocalCreator creates memory allocation locally through cuMemCreate.

+
+

Public Functions

+
+
+inline LocalCreator(CUmemAllocationProp const &prop, size_t size)#
+
+ +
+
+inline virtual CUmemGenericAllocationHandle create() override#
+
+ +
+
+inline virtual void release( + +
+
CUmemGenericAllocationHandle handle,
+
bool destructing,
+
+ +) override#
+
+ +
+
+

Public Members

+
+
+CUmemAllocationProp mProp = {}#
+
+ +
+
+size_t mSize = {}#
+
+ +
+
+ +
+
+struct MemsetConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
+
+#include <virtualMemory.h>
+

MemsetConfigurator fills the memory with given value.

+
+

Public Functions

+
+
+inline MemsetConfigurator( + +
+
CUdeviceptr address,
+
size_t size,
+
uint8_t value,
+
CUstream stream,
+
+ +)#
+
+ +
+
+inline virtual void setup(CUmemGenericAllocationHandle) override#
+
+ +
+
+inline virtual void teardown( + +
+
CUmemGenericAllocationHandle,
+
bool,
+
+ +) noexcept override#
+
+ +
+
+

Public Members

+
+
+CUdeviceptr mAddress#
+
+ +
+
+size_t mSize#
+
+ +
+
+CUstream mStream = {}#
+
+ +
+
+uint8_t mValue#
+
+ +
+
+bool mFirstTime = true#
+
+ +
+
+ +
+
+struct MulticastConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
+
+#include <virtualMemory.h>
+

MulticastConfigurator binds the allocation handle to the given multicast object and offset.

+
+

Public Functions

+
+
+inline virtual void setup( + +
+
CUmemGenericAllocationHandle handle,
+
+ +) override#
+
+ +
+
+inline virtual void teardown( + +
+
CUmemGenericAllocationHandle,
+
bool,
+
+ +) override#
+
+ +
+
+

Public Members

+
+
+CUmemGenericAllocationHandle mMulticast#
+
+ +
+
+size_t mBindOffset#
+
+ +
+
+CUdevice mDevice#
+
+ +
+
+size_t mSize#
+
+ +
+
+ +
+
+struct OffloadConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
+
+#include <virtualMemory.h>
+

OffloadConfigurator offloads the content of the allocation to the backup storage on teardown, and restores the content on the following setup.

+
+

Public Functions

+
+
+inline OffloadConfigurator( + +
+
CUdeviceptr address,
+
size_t size,
+
MemoryType backType,
+
CUstream stream,
+
bool ondemand = false,
+
+ +)#
+
+ +
+
+virtual void setup(CUmemGenericAllocationHandle handle) override#
+
+ +
+
+virtual void teardown( + +
+
CUmemGenericAllocationHandle handle,
+
bool destructing,
+
+ +) override#
+
+ +
+
+

Public Members

+
+
+CUdeviceptr mAddress#
+
+ +
+
+size_t mSize#
+
+ +
+
+MemoryType mBackType#
+
+ +
+
+CUstream mStream#
+
+ +
+
+bool mOndemand#
+
+ +
+
+IBuffer::UniquePtr mBackedStorage#
+
+ +
+
+ +
+
+struct UnicastConfigurator : public tensorrt_llm::runtime::CUDAVirtualMemoryChunk::Configurator#
+
+#include <virtualMemory.h>
+

UnicastConfigurator maps the allocation handle into the specified unicast address range.

+
+

Public Functions

+
+
+inline UnicastConfigurator( + +
+
CUdeviceptr address,
+
size_t size,
+
CUmemAccessDesc const &desc,
+
+ +)#
+
+ +
+
+inline virtual void setup( + +
+
CUmemGenericAllocationHandle handle,
+
+ +) override#
+
+ +
+
+inline virtual void teardown( + +
+
CUmemGenericAllocationHandle,
+
bool,
+
+ +) override#
+
+ +
+
+

Public Members

+
+
+CUdeviceptr mAddress#
+
+ +
+
+size_t mSize#
+
+ +
+
+CUmemAccessDesc mDesc#
+
+ +
+
+ +
+ +
+ +
+
+

explicitDraftTokensBuffers.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class ExplicitDraftTokensBuffers#
+
+

Public Types

+
+
+using SizeType32 = runtime::SizeType32#
+
+ +
+
+using ITensor = runtime::ITensor#
+
+ +
+
+using BufferPtr = runtime::IBuffer::SharedPtr#
+
+ +
+
+using TensorPtr = runtime::ITensor::SharedPtr#
+
+ +
+
+using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
+
+ +
+
+

Public Functions

+
+
+ExplicitDraftTokensBuffers( + +
+
SizeType32 maxBatchSize,
+
SizeType32 maxBeamWidth,
+
runtime::BufferManager const &manager,
+
runtime::ModelConfig const &modelConfig,
+
runtime::WorldConfig const &worldConfig,
+
+ +)#
+
+ +
+
+void reshape( + +
+
SizeType32 numCtxSequences,
+
SizeType32 numGenSequences,
+
runtime::ModelConfig const &modelConfig,
+
+ +)#
+
+ +
+
+void setFromInputs( + +
+
SizeType32 numCtxSequences,
+
SizeType32 numGenSequences,
+
runtime::ITensor const &requestTypes,
+
ITensor const &seqSlots,
+
ExplicitDraftTokensBuffers::Inputs const &decoderBuffers,
+
ITensor const &contextPositionIds,
+
runtime::ModelConfig const &modelConfig,
+
runtime::WorldConfig const &worldConfig,
+
runtime::BufferManager const &manager,
+
runtime::CudaStream const &stream,
+
+ +) const#
+
+ +
+
+void insertInputTensors( + +
+
TensorMap &inputBuffers,
+
TensorMap &outputBuffers,
+
runtime::WorldConfig const &worldConfig,
+
+ +) const#
+
+ +
+
+

Public Members

+
+
+tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs#
+
+ +
+
+class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs#
+
+ +
+
+std::size_t scanTempStorageBytes = {0}#
+
+ +
+
+BufferPtr scanTempStorage#
+
+ +
+
+TensorPtr cumSumGenerationLengths#
+
+ +
+
+

Private Functions

+
+
+template<typename T>
void setFromInputs( + +
+
SizeType32 numCtxSequences,
+
SizeType32 numGenSequences,
+
SizeType32 vocabSizePadded,
+
ITensor const &seqSlots,
+
ExplicitDraftTokensBuffers::Inputs const &draftBuffers,
+
ITensor const &contextPositionIds,
+
runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule,
+
runtime::CudaStream const &stream,
+
+ +) const#
+
+ +
+
+
+class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs#
+
+

Public Members

+
+
+TensorPtr requestTypesDevice#
+

[numSequences], on gpu

+
+ +
+
+TensorPtr positionOffsets#
+

[numGenSequences]

+
+ +
+
+ +
+
+class EngineOutputs#
+
+

Public Members

+
+
+TensorPtr nextGenerationLengths#
+

[batchSize]

+
+ +
+
+TensorPtr nextPositionOffsets#
+

[batchSize]

+
+ +
+
+TensorPtr masks#
+

[batchSize, maxDecodingTokens, maxDecodingTokens], bool

+
+ +
+
+TensorPtr nextDraftTokens#
+

[batchSize, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr nextDraftIndices#
+

[batchSize, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr nextDraftProbs#
+

[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

+
+ +
+
+TensorPtr nextFlatTokens#
+

[batchSize * maxDecodingTokens]

+
+ +
+
+TensorPtr bestPathLengths#
+

[batchSize]

+
+ +
+
+TensorPtr bestPathIndices#
+

[batchSize]

+
+ +
+
+TensorPtr maxGenToken#
+

[1]

+
+ +
+
+TensorPtr totalGenToken#
+

[1]

+
+ +
+
+TensorPtr packedPositionIds#
+

[batchSize * maxDecodingTokens]

+
+ +
+
+ +
+
+class Inputs#
+

Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs

+
+

Public Functions

+
+
+void create( + +
+
SizeType32 maxNumSequences,
+
runtime::BufferManager const &manager,
+
runtime::ModelConfig const &modelConfig,
+
runtime::WorldConfig const &worldConfig,
+
+ +)#
+
+ +
+
+

Public Members

+
+
+TensorPtr temperatures#
+

[maxBatchSize]

+
+ +
+
+TensorPtr positionIdsBase#
+

[maxBatchSize]

+
+ +
+
+TensorPtr generationLengths#
+

[maxBatchSize] or [numGenSequences]

+
+ +
+
+TensorPtr randomDataSample#
+

[maxBatchSize]

+
+ +
+
+TensorPtr randomDataValidation#
+

[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]

+
+ +
+
+TensorPtr draftTokens#
+

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr draftIndices#
+

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

+
+ +
+
+TensorPtr draftProbs#
+

[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]

+
+ +
+
+TensorPtr packedMasks#
+

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

+
+ +
+
+TensorPtr positionIds#
+

[maxBatchSize] or [numGenSequences]

+
+ +
+
+TensorPtr maxGenLengthHost#
+
+ +
+
+TensorPtr generationLengthsHost#
+
+ +
+
+TensorPtr useSpecDecoding#
+
+ +
+
+ +
+ +
+ +
+ +
+
+

iTensor.h#

+
+
+namespace nvinfer1#
+
+ +
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Functions

+
+
+inline std::ostream &operator<<( + +
+
std::ostream &output,
+
ITensor::Shape const &dims,
+
+ +)#
+

Utility function to print a shape.

+
+ +
+
+std::ostream &operator<<( + +
+
std::ostream &output,
+
ITensor const &tensor,
+
+ +)#
+

Utility function to print a tensor with its shape.

+
+ +
+
+template<typename T>
T const *bufferCastOrNull( + +
+
ITensor::SharedConstPtr const &tensorPtr,
+
+ +)#
+

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

+

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

+
+
Template Parameters:
+

T – The type of the underlying data.

+
+
Parameters:
+

tensorPtr – A possibly null shared ptr.

+
+
Returns:
+

A pointer to T const, possibly nullptr.

+
+
+
+ +
+
+template<typename T>
T *bufferCastOrNull( + +
+
ITensor::SharedPtr const &tensorPtr,
+
+ +)#
+

Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensorPtr, or nullptr if the tensorPtr is null.

+

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

+
+
Template Parameters:
+

T – The type of the underlying data.

+
+
Parameters:
+

tensorPtr – A possibly null shared ptr.

+
+
Returns:
+

A pointer to T, possibly nullptr.

+
+
+
+ +
+
+template<typename T>
T *bufferCastOrNull( + +
+
std::optional<ITensor::SharedPtr> const &optionalTensorPtr,
+
+ +)#
+

Retrieves a T typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.

+

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

+
+
Template Parameters:
+

T – The type of the underlying data.

+
+
Parameters:
+

optionalTensorPtr – A possibly empty optional.

+
+
Returns:
+

A pointer to T, possibly nullptr.

+
+
+
+ +
+
+template<typename T>
T const *bufferCastOrNull( + +
+
std::optional<ITensor::SharedConstPtr> const &optionalTensorPtr,
+
+ +)#
+

Retrieves a T const typed pointer to the underlying data of the tensor pointed to by the tensor pointer contained in the optionalTensorPtr, or nullptr if the optional doesn’t have a value.

+

This overload has to be declared to avoid ambiguity when an implicit conversion to IBuffer is involved.

+
+
Template Parameters:
+

T – The type of the underlying data.

+
+
Parameters:
+

optionalTensorPtr – A possibly empty optional.

+
+
Returns:
+

A pointer to const T, possibly nullptr.

+
+
+
+ +
+
+
+class ITensor : public virtual tensorrt_llm::runtime::IBuffer#
+
+

Public Types

+
+
+using UniquePtr = std::unique_ptr<ITensor>#
+
+ +
+
+using SharedPtr = std::shared_ptr<ITensor>#
+
+ +
+
+using UniqueConstPtr = std::unique_ptr<ITensor const>#
+
+ +
+
+using SharedConstPtr = std::shared_ptr<ITensor const>#
+
+ +
+
+using Shape = nvinfer1::Dims#
+
+ +
+
+using DimType64 = std::remove_reference_t<decltype(Shape::d[0])>#
+
+ +
+
+using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
+
+ +
+
+

Public Functions

+
+
+~ITensor() override = default#
+
+ +
+
+virtual Shape const &getShape() const = 0#
+

Returns the tensor dimensions.

+
+ +
+
+template<SizeType32 n>
inline DimType64 getDimension() const#
+

Returns the n-th dimension of the tensor. If n is negative, returns the (nbDims + n)th dimension. TODO: replace with constexpr parameter when moving to C++20.

+
+ +
+
+virtual void reshape(Shape const &dims) = 0#
+

Sets the tensor dimensions. The new size of the tensor will be volume(dims)

+
+ +
+
+inline virtual void resize(std::size_t newSize) override#
+

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

+
+ +
+
+ITensor(ITensor const&) = delete#
+

Not allowed to copy.

+
+ +
+
+ITensor &operator=(ITensor const&) = delete#
+

Not allowed to copy.

+
+ +
+
+inline void squeeze(SizeType32 dim)#
+

Removes the given unit dimension from this tensor.

+
+ +
+
+inline void unsqueeze(SizeType32 dim)#
+

Adds a unit dimension at the specified position.

+
+ +
+
+inline bool shapeEquals(Shape const &other) const#
+
+ +
+
+inline bool shapeEquals( + +
+
std::initializer_list<SizeType32> const &other,
+
+ +) const#
+
+ +
+
+template<typename T>
inline bool shapeEquals( + +
+
T const *dims,
+
SizeType32 count,
+
+ +) const#
+
+ +
+
+

Public Static Functions

+
+
+static inline std::int64_t volume(Shape const &dims)#
+

Returns the volume of the dimensions. Returns -1 if dims.nbDims < 0.

+
+ +
+
+static inline std::size_t volumeNonNegative(Shape const &shape)#
+

Returns the volume of the dimensions. Throws if shape.nbDims < 0.

+
+ +
+
+static inline Shape strides(Shape const &dims)#
+

Returns the strides of each dimension in a Shape.

+
+ +
+
+static Shape squeeze(Shape const &shape, SizeType32 dim)#
+

Removes the given unit dimension from shape.

+
+
Parameters:
+
    +
  • shape – The shape to squeeze.

  • +
  • dim – The dimension that should be removed (“squeezed”).

  • +
+
+
Returns:
+

A new shape without the unit dimension.

+
+
+
+ +
+
+static Shape unsqueeze(Shape const &shape, SizeType32 dim)#
+

Add a unit dimension to shape at the specified position.

+
+
Parameters:
+
    +
  • shape – The shape to unsqueeze.

  • +
  • dim – The dimension where unit dimension should be added.

  • +
+
+
Returns:
+

A new shape with the added unit dimension.

+
+
+
+ +
+
+static UniquePtr slice( + +
+
SharedPtr tensor,
+
std::size_t offset,
+
std::size_t size,
+
+ +)#
+

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

+
+
Parameters:
+
    +
  • tensor – The tensor to view.

  • +
  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • +
  • size – The size of the view w.r.t. dimension 0 of the tensor.

  • +
+
+
Returns:
+

A view on the buffer.

+
+
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
std::size_t offset,
+
std::size_t size,
+
+ +)#
+
+ +
+
+static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
std::size_t offset,
+
+ +)#
+
+ +
+
+static UniquePtr slice( + +
+
SharedPtr tensor,
+
Shape const &offsetDims,
+
DimType64 size,
+
+ +)#
+
+
Parameters:
+
    +
  • offsetDims – The offset in multiple dimensions.

  • +
  • tensor – The tensor to view.

  • +
  • offsetDims – The offset dimensions of the view.

  • +
  • size – The size of the view w.r.t. the last dimension in offsetDims.

  • +
  • offsetDims – specifies all dimensions.

  • +
+
+
Throws:
+

Whenever offset overflows or the last dimension offset+size overflows.

+
+
Returns:
+

A view of shape [size, the rest dimensions] or [size] when offsetDims specifies all dimensions.

+
+
+
+ +
+
+static inline UniquePtr slice( + +
+
SharedPtr tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
DimType64 size,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
Shape const &offsetDims,
+
std::size_t size,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
std::size_t size,
+
+ +)#
+
+ +
+
+static inline UniquePtr slice( + +
+
SharedPtr tensor,
+
Shape const &offsetDims,
+
+ +)#
+

Returns the remaining slices at the last dimension when size is omitted.

+
+ +
+
+static inline UniquePtr slice( + +
+
SharedPtr tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
Shape const &offsetDims,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( + +
+
TConstPtr &&tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
+ +)#
+
+ +
+
+static inline UniquePtr at(SharedPtr tensor, Shape const &offsetDims)#
+
+
Parameters:
+

offsetDims – specifies all dimensions.

+
+
Returns:
+

Just the block at the point, with shape of [the rest dimensions] or [1] when offsetDims specifies all dimensions.

+
+
+
+ +
+
+static inline UniquePtr at( + +
+
SharedPtr tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr at( + +
+
TConstPtr &&tensor,
+
Shape const &offsetDims,
+
+ +)#
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline ITensor::UniqueConstPtr at( + +
+
TConstPtr &&tensor,
+
std::initializer_list<DimType64> const &offsetDims,
+
+ +)#
+
+ +
+
+static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)#
+

Returns a view on the underlying buffer (or tensor) with the given shape.

+
+
Parameters:
+
    +
  • buffer – The buffer (or tensor) to view.

  • +
  • dims – The shape of the view.

  • +
+
+
Returns:
+

A view on the tensor.

+
+
+
+ +
+
+template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view( + +
+
TConstPtr &&tensor,
+
Shape const &dims,
+
+ +)#
+
+ +
+
+static inline UniquePtr view(SharedPtr tensor)#
+

Returns a view on the underlying tensor which can be independently reshaped.

+
+
Parameters:
+

tensor – The tensor to view.

+
+
Returns:
+

A view on the tensor.

+
+
+
+ +
+
+static inline UniquePtr flattenN( + +
+
SharedPtr tensor,
+
std::int64_t sliceN = -1,
+
+ +)#
+

Returns a flattened view on the underlying tensor which can be independently reshaped.

+
+
Parameters:
+
    +
  • tensor – The tensor to flatten.

  • +
  • sliceN – Slice the first N elements after flattening. -1 means take the whole flattened tensor.

  • +
+
+
Returns:
+

A flattened view on the tensor.

+
+
+
+ +
+
+static UniquePtr wrap( + +
+
void *data,
+
nvinfer1::DataType type,
+
Shape const &shape,
+
std::size_t capacity,
+
+ +)#
+

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

+
+
Parameters:
+
    +
  • data – The data to wrap.

  • +
  • type – The data type of the data.

  • +
  • shape – The shape of the tensor.

  • +
  • capacity – The capacity of the buffer.

  • +
+
+
Returns:
+

An ITensor.

+
+
+
+ +
+
+static inline UniquePtr wrap( + +
+
void *data,
+
nvinfer1::DataType type,
+
Shape const &shape,
+
+ +)#
+
+ +
+
+template<typename T>
static inline UniquePtr wrap( + +
+
T *data,
+
Shape const &shape,
+
std::size_t capacity,
+
+ +)#
+
+ +
+
+template<typename T>
static inline UniquePtr wrap( + +
+
T *data,
+
Shape const &shape,
+
+ +)#
+
+ +
+
+template<typename T>
static inline UniquePtr wrap( + +
+
std::vector<T> &v,
+
Shape const &shape,
+
+ +)#
+
+ +
+
+static Shape makeShape( + +
+
std::initializer_list<DimType64> const &dims,
+
+ +)#
+

A convenience function to create a tensor shape with the given dimensions.

+
+ +
+
+static std::string toString(Shape const &dims)#
+

A convenience function for converting a tensor shape to a string.

+
+ +
+
+static inline bool shapeEquals(Shape const &lhs, Shape const &rhs)#
+

A convenience function to compare shapes.

+
+ +
+
+template<typename T>
static inline bool shapeEquals( + +
+
Shape const &lhs,
+
T const *dims,
+
SizeType32 count,
+
+ +)#
+

A convenience function to compare shapes.

+
+ +
+
+

Protected Functions

+
+
+ITensor() = default#
+
+ +
+
+

Protected Static Functions

+
+
+static inline DimType64 castSize(size_t newSize)#
+
+ +
+
+

Friends

+
+
+friend class ITensorBindings
+
+ +
+
+ +
+ +
+ +
+
+

common.h#

+
+

Defines

+
+
+FMT_DIM#
+
+ +
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Typedefs

+
+
+using SizeType32 = std::int32_t#
+
+ +
+
+using SizeType64 = std::int64_t#
+
+ +
+
+using TokenIdType = std::int32_t#
+
+ +
+
+using LoraTaskIdType = std::uint64_t#
+
+ +
+
+using TokenExtraIdType = std::uint64_t#
+
+ +
+
+using VecTokenExtraIds = std::vector<TokenExtraIdType>#
+
+ +
+
+using CacheSaltIDType = std::uint64_t#
+
+ +
+
+using VecUniqueTokens = std::vector<UniqueToken>#
+
+ +
+
+template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>#
+
+ +
+
+

Enums

+
+
+enum class RequestType : std::int32_t#
+

Values:

+
+
+enumerator kCONTEXT#
+
+ +
+
+enumerator kGENERATION#
+
+ +
+ +
+
+
+struct UniqueToken#
+
+

Public Functions

+
+
+inline bool operator==(UniqueToken const &other) const noexcept#
+
+ +
+
+

Public Members

+
+
+TokenIdType tokenId#
+
+ +
+
+TokenExtraIdType tokenExtraId#
+
+ +
+
+ +
+ +
+ +
+
+

loraCachePageManagerConfig.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Functions

+
+
+inline std::ostream &operator<<( + +
+
std::ostream &os,
+
LoraCachePageManagerConfig const &c,
+
+ +)#
+
+ +
+
+inline std::string to_string(LoraCachePageManagerConfig const &c)#
+
+ +
+
+
+class LoraCachePageManagerConfig#
+
+#include <loraCachePageManagerConfig.h>
+

Configuration for LoraCachePageManager

+

See LoraCache docs for description of pages, slots, and page blocks.

+
+

Public Functions

+
+
+inline explicit constexpr LoraCachePageManagerConfig( + +
+
runtime::MemoryType memType,
+
nvinfer1::DataType dType,
+
SizeType32 totalNumPages,
+
SizeType32 maxPagesPerBlock,
+
SizeType32 slotsPerPage,
+
SizeType32 pageWidth,
+
SizeType32 numCopyStreams,
+
+ +)#
+
+ +
+
+inline runtime::MemoryType constexpr getMemoryType() const noexcept#
+
+ +
+
+inline void constexpr setMemoryType( + +
+
runtime::MemoryType const &memoryType,
+
+ +) noexcept#
+
+ +
+
+inline nvinfer1::DataType constexpr getDataType() const noexcept#
+
+ +
+
+inline void constexpr setDataType( + +
+
nvinfer1::DataType const &dtype,
+
+ +) noexcept#
+
+ +
+
+inline SizeType32 constexpr getTotalNumPages() const noexcept#
+
+ +
+
+inline void constexpr setTotalNumPage( + +
+
SizeType32 const &totalNumPages,
+
+ +) noexcept#
+
+ +
+
+inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept#
+
+ +
+
+inline void constexpr setMaxPagesPerBlock( + +
+
SizeType32 const &maxPagesPerBlock,
+
+ +) noexcept#
+
+ +
+
+inline SizeType32 constexpr getSlotsPerPage() const noexcept#
+
+ +
+
+inline void constexpr setSlotsPerPage( + +
+
SizeType32 const &slotsPerPage,
+
+ +) noexcept#
+
+ +
+
+inline SizeType32 constexpr getPageWidth() const noexcept#
+
+ +
+
+inline void constexpr setPageWidth( + +
+
SizeType32 const &pageWidth,
+
+ +) noexcept#
+
+ +
+
+inline bool constexpr getInitToZero() const noexcept#
+
+ +
+
+inline void constexpr setInitToZero(bool initToZero) noexcept#
+
+ +
+
+inline SizeType32 constexpr getNumCopyStreams() const noexcept#
+
+ +
+
+inline void constexpr setNumCopyStreams( + +
+
SizeType32 numCopyStreams,
+
+ +) noexcept#
+
+ +
+
+

Private Members

+
+
+runtime::MemoryType mMemoryType#
+
+ +
+
+nvinfer1::DataType mDataType#
+
+ +
+
+SizeType32 mTotalNumPages#
+
+ +
+
+SizeType32 mMaxPagesPerBlock#
+
+ +
+
+SizeType32 mSlotsPerPage#
+
+ +
+
+SizeType32 mPageWidth#
+
+ +
+
+SizeType32 mNumCopyStreams = 1#
+
+ +
+
+bool mInitToZero#
+
+ +
+
+ +
+ +
+ +
+
+

worldConfig.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class WorldConfig#
+
+

Public Functions

+
+
+explicit WorldConfig( + +
+
SizeType32 tensorParallelism = 1,
+
SizeType32 pipelineParallelism = 1,
+
SizeType32 contextParallelism = 1,
+
SizeType32 rank = 0,
+
SizeType32 gpusPerNode = kDefaultGpusPerNode,
+
std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
+
bool enableAttentionDP = false,
+
+ +)#
+
+ +
+
+inline SizeType32 constexpr getSize() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getTensorParallelism() const noexcept#
+
+ +
+
+inline bool constexpr isTensorParallel() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getPipelineParallelism() const noexcept#
+
+ +
+
+inline bool constexpr isPipelineParallel() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getContextParallelism() const noexcept#
+
+ +
+
+inline bool constexpr isContextParallel() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getGpusPerNode() const noexcept#
+
+ +
+
+inline SizeType32 getGpusPerGroup() const noexcept#
+
+ +
+
+inline SizeType32 getDevice() const noexcept#
+
+ +
+
+inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept#
+
+ +
+
+inline SizeType32 constexpr getPipelineParallelRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getTensorParallelRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getContextParallelRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getLocalRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getNodeRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getNodeRankOf( + +
+
SizeType32 rank,
+
+ +) const noexcept#
+
+ +
+
+inline bool constexpr isFirstPipelineParallelRank() const noexcept#
+
+ +
+
+inline bool constexpr isLastPipelineParallelRank() const noexcept#
+

Is my rank the last rank in its pipeline?

+
+ +
+
+inline bool constexpr isFirstTensorParallelRank() const noexcept#
+
+ +
+
+inline bool constexpr isFirstContextParallelRank() const noexcept#
+
+ +
+
+inline SizeType32 constexpr getLastRank() const noexcept#
+
+ +
+
+inline bool constexpr enableAttentionDP() const noexcept#
+
+ +
+
+std::vector<SizeType32> getPipelineParallelGroup() const#
+
+ +
+
+std::vector<SizeType32> getTensorParallelGroup() const#
+
+ +
+
+std::vector<SizeType32> getContextParallelGroup() const#
+
+ +
+
+bool validMpiConfig() const#
+
+ +
+
+

Public Static Functions

+
+
+static WorldConfig mpi( + +
+
SizeType32 gpusPerNode = kDefaultGpusPerNode,
+
std::optional<SizeType32> tensorParallelism = std::nullopt,
+
std::optional<SizeType32> pipelineParallelism = std::nullopt,
+
std::optional<SizeType32> contextParallelism = std::nullopt,
+
std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
+
bool enableAttentionDP = false,
+
+ +)#
+
+ +
+
+

Public Static Attributes

+
+
+static SizeType32 constexpr kDefaultGpusPerNode = 1#
+
+ +
+
+

Private Members

+
+
+SizeType32 mTensorParallelism#
+
+ +
+
+SizeType32 mPipelineParallelism#
+
+ +
+
+SizeType32 mContextParallelism#
+
+ +
+
+SizeType32 mRank#
+
+ +
+
+SizeType32 mGpusPerNode#
+
+ +
+
+bool mEnableAttentionDP#
+
+ +
+
+std::vector<SizeType32> mDeviceIds#
+
+ +
+
+ +
+ +
+ +
+
+

loraModule.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Functions

+
+
+inline std::ostream &operator<<( + +
+
std::ostream &output,
+
LoraModule const &module,
+
+ +)#
+
+ +
+
+
+class LoraModule#
+
+

Public Types

+
+
+enum class ModuleType : SizeType32#
+

Values:

+
+
+enumerator kINVALID#
+
+ +
+
+enumerator kATTN_QKV#
+
+ +
+
+enumerator kATTN_Q#
+
+ +
+
+enumerator kATTN_K#
+
+ +
+
+enumerator kATTN_V#
+
+ +
+
+enumerator kATTN_DENSE#
+
+ +
+
+enumerator kMLP_H_TO_4H#
+
+ +
+
+enumerator kMLP_4H_TO_H#
+
+ +
+
+enumerator kMLP_GATE#
+
+ +
+
+enumerator kCROSS_ATTN_QKV#
+
+ +
+
+enumerator kCROSS_ATTN_Q#
+
+ +
+
+enumerator kCROSS_ATTN_K#
+
+ +
+
+enumerator kCROSS_ATTN_V#
+
+ +
+
+enumerator kCROSS_ATTN_DENSE#
+
+ +
+
+enumerator kMOE_H_TO_4H#
+
+ +
+
+enumerator kMOE_4H_TO_H#
+
+ +
+
+enumerator kMOE_GATE#
+
+ +
+
+enumerator kMOE_ROUTER#
+
+ +
+
+enumerator kMLP_ROUTER#
+
+ +
+
+enumerator kMLP_GATE_UP#
+
+ +
+ +
+
+using TensorPtr = ITensor::SharedPtr#
+
+ +
+
+

Public Functions

+
+
+inline explicit constexpr LoraModule( + +
+
ModuleType const &t,
+
SizeType32 inDim,
+
SizeType32 outDim,
+
bool inDimFirst,
+
bool outDimFirst,
+
SizeType32 inTpSplitDim,
+
SizeType32 outTpSplitDim,
+
+ +) noexcept#
+
+ +
+
+inline explicit constexpr LoraModule() noexcept#
+
+ +
+
+explicit constexpr LoraModule(LoraModule const &o) = default#
+
+ +
+
+constexpr LoraModule &operator=(LoraModule const &o) = default#
+
+ +
+
+inline SizeType32 constexpr flattenedInOutSize( + +
+
SizeType32 adapterSize,
+
bool isDora,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr inSize( + +
+
SizeType32 adapterSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr outSize( + +
+
SizeType32 adapterSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localInSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localOutSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localScalesSize( + +
+
SizeType32 tpSize,
+
bool isDora,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localInDim( + +
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localOutDim( + +
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localInAdapterSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localOutAdapterSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localInOutSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr localTotalSize( + +
+
SizeType32 adapterSize,
+
SizeType32 tpSize,
+
bool isDora,
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 constexpr value() const noexcept#
+
+ +
+
+inline std::string_view constexpr name() const noexcept#
+
+ +
+
+inline SizeType32 constexpr inDim() const noexcept#
+
+ +
+
+inline SizeType32 constexpr outDim() const noexcept#
+
+ +
+
+inline bool constexpr inDimFirst() const noexcept#
+
+ +
+
+inline bool constexpr outDimFirst() const noexcept#
+
+ +
+
+inline SizeType32 constexpr inTpSplitDim() const noexcept#
+
+ +
+
+inline SizeType32 constexpr outTpSplitDim() const noexcept#
+
+ +
+
+

Public Static Functions

+
+
+static std::vector<LoraModule> createLoraModules( + +
+
std::vector<std::string> const &loraModuleNames,
+
SizeType32 hiddenSize,
+
SizeType32 mlpHiddenSize,
+
SizeType32 numAttentionHeads,
+
SizeType32 numKvAttentionHeads,
+
SizeType32 attentionHeadSize,
+
SizeType32 tpSize,
+
SizeType32 numExperts,
+
+ +)#
+
+ +
+
+static inline ModuleType constexpr toModuleType( + +
+
std::string_view const &name,
+
+ +)#
+
+ +
+
+static inline std::string_view constexpr toModuleName( + +
+
ModuleType t,
+
+ +) noexcept#
+
+ +
+
+static inline std::string_view constexpr toModuleName(SizeType32 id)#
+
+ +
+
+

Private Members

+
+
+ModuleType mType#
+
+ +
+
+SizeType32 mInDim#
+
+ +
+
+SizeType32 mOutDim#
+
+ +
+
+bool mInDimFirst#
+
+ +
+
+bool mOutDimFirst#
+
+ +
+
+SizeType32 mInTpSplitDim#
+
+ +
+
+SizeType32 mOutTpSplitDim#
+
+ +
+
+ +
+ +
+ +
+
+

speculativeDecodingMode.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class SpeculativeDecodingMode#
+
+

Public Types

+
+
+using UnderlyingType = std::uint8_t#
+
+ +
+
+

Public Functions

+
+
+inline bool constexpr isNone() const#
+
+ +
+
+inline bool constexpr isDraftTokensExternal() const#
+
+ +
+
+inline bool constexpr isMedusa() const#
+
+ +
+
+inline bool constexpr isLookaheadDecoding() const#
+
+ +
+
+inline bool constexpr isExplicitDraftTokens() const#
+
+ +
+
+inline bool constexpr isEagle() const#
+
+ +
+
+inline bool constexpr updatesPositionIds() const#
+
+ +
+
+inline bool constexpr requiresAttentionMask() const#
+
+ +
+
+inline bool constexpr predictsDraftTokens() const#
+
+ +
+
+inline bool constexpr needsKVCacheRewind() const#
+
+ +
+
+inline bool constexpr variableDraftLength() const#
+
+ +
+
+inline bool constexpr hasDraftLogits() const#
+
+ +
+
+inline bool constexpr needsDecoderPrologue() const#
+
+ +
+
+inline bool operator==(SpeculativeDecodingMode const &other) const#
+
+ +
+
+inline explicit constexpr SpeculativeDecodingMode( + +
+
UnderlyingType state,
+
+ +)#
+
+ +
+
+

Public Static Functions

+
+
+static inline auto constexpr None()#
+
+ +
+
+static inline auto constexpr DraftTokensExternal()#
+
+ +
+
+static inline auto constexpr Medusa()#
+
+ +
+
+static inline auto constexpr LookaheadDecoding()#
+
+ +
+
+static inline auto constexpr ExplicitDraftTokens()#
+
+ +
+
+static inline auto constexpr Eagle()#
+
+ +
+
+

Private Functions

+
+
+inline bool constexpr anyBitSet(UnderlyingType bits) const#
+
+ +
+
+inline bool constexpr allBitSet(UnderlyingType bits) const#
+
+ +
+
+

Private Members

+
+
+UnderlyingType mState = {kNone}#
+
+ +
+
+

Private Static Attributes

+
+
+static UnderlyingType constexpr kNone = {1U << 0U}#
+
+ +
+
+static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}#
+
+ +
+
+static UnderlyingType constexpr kMedusa = {1U << 2U}#
+
+ +
+
+static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}#
+
+ +
+
+static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}#
+
+ +
+
+static UnderlyingType constexpr kEagle = {1U << 5U}#
+
+ +
+
+ +
+ +
+ +
+
+

cudaEvent.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class CudaEvent#
+
+

Public Types

+
+
+using pointer = cudaEvent_t#
+
+ +
+
+

Public Functions

+
+
+inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)#
+

Creates a new cuda event. The event will be destroyed in the destructor.

+
+
Parameters:
+

flags – Flags for event creation. By default, event timing is disabled.

+
+
+
+ +
+
+inline explicit CudaEvent(pointer event, bool ownsEvent = true)#
+

Pass an existing cuda event to this object.

+
+
Parameters:
+
    +
  • event – The event to pass to this object.

  • +
  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

  • +
+
+
+
+ +
+
+inline pointer get() const#
+

Returns the event associated with this object.

+
+ +
+
+inline void synchronize() const#
+

Synchronizes the event.

+
+ +
+
+

Private Types

+
+
+using element_type = std::remove_pointer_t<pointer>#
+
+ +
+
+using EventPtr = std::unique_ptr<element_type, Deleter>#
+
+ +
+
+

Private Members

+
+
+EventPtr mEvent#
+
+ +
+
+
+class Deleter#
+
+

Public Functions

+
+
+inline explicit Deleter(bool ownsEvent)#
+
+ +
+
+inline explicit Deleter()#
+
+ +
+
+inline constexpr void operator()(pointer event) const#
+
+ +
+
+

Private Members

+
+
+bool mOwnsEvent#
+
+ +
+
+ +
+ +
+ +
+ +
+
+

decodingInput.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class DecodingInput#
+
+#include <decodingInput.h>
+

Represents the inputs to the decoder.

+

This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.

+
+

Public Types

+
+
+using TensorConstPtr = ITensor::SharedConstPtr#
+
+ +
+
+using TensorPtr = ITensor::SharedPtr#
+
+ +
+
+

Public Functions

+
+
+DecodingInput() = default#
+
+ +
+
+

Public Members

+
+
+SizeType32 step = {}#
+

Mandatory parameters. The index of the decoding step we are on. Only used in the Python runtime.

+
+ +
+
+SizeType32 maxLength = {}#
+

The maximum number of tokens to decode.

+
+ +
+
+SizeType32 maxAttentionWindow = {}#
+

The maximum length of the attention window to consider while decoding.

+
+ +
+
+SizeType32 sinkTokenLength = {}#
+

The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.

+
+ +
+
+SizeType32 batchSize = {}#
+

The number of samples in the batch.

+
+ +
+
+std::vector<SizeType32> beamWidths#
+

The beam widths of each request, [batchSize].

+
+ +
+
+SizeType32 maxStopWordsLen = {}#
+

The maximum value in the stopWordsLens tensor.

+
+ +
+
+SizeType32 maxBadWordsLen = {}#
+

The maximum value in the badWordsLens tensor.

+
+ +
+
+std::vector<TensorConstPtr> logitsVec#
+

The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu

+
+ +
+
+TensorConstPtr endIds#
+

The end ids, [batchSize * beamWidth] on gpu.

+
+ +
+
+TensorConstPtr batchSlots#
+

Address map of the linear batch id to the seq slots, [batchSize] on pinned, int32_t.

+
+ +
+
+TensorConstPtr finishReasons#
+

Optional parameters. Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu.

+
+ +
+
+TensorConstPtr sequenceLimitLength#
+

The maximum sequence length for each sequence in the batch, [batchSize] on gpu.

+
+ +
+
+TensorConstPtr embeddingBias#
+
+ +
+
+TensorConstPtr lengths#
+
+ +
+
+std::vector<TensorPtr> badWordsLists#
+
+ +
+
+TensorConstPtr badWordsPtrs#
+
+ +
+
+TensorConstPtr badWordsLens#
+
+ +
+
+std::vector<TensorPtr> stopWordsLists#
+
+ +
+
+TensorConstPtr stopWordsPtrs#
+
+ +
+
+TensorConstPtr stopWordsLens#
+
+ +
+
+TensorConstPtr noRepeatNgramSize#
+
+ +
+
+TensorPtr cacheIndirection#
+

Parameters for beam search. KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu.

+
+ +
+
+std::optional<std::vector<SizeType32>> generationSteps#
+

Steps of each request, for Variable-Beam-Width-Search, [batchSize].

+
+ +
+
+std::optional<MedusaInputs> medusaInputs#
+
+ +
+
+std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs#
+
+ +
+
+std::optional<LookaheadInputs> lookaheadInputs#
+
+ +
+
+std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs#
+
+ +
+
+std::optional<EagleInputs> eagleInputs#
+
+ +
+
+
+struct EagleInputs#
+
+

Public Members

+
+
+TensorConstPtr nextDraftTokens#
+
+ +
+
+TensorConstPtr nextDraftLens#
+
+ +
+
+TensorConstPtr nextDraftPaths#
+
+ +
+
+TensorConstPtr lastDraftTokens#
+
+ +
+
+TensorConstPtr lastDraftLens#
+
+ +
+
+TensorConstPtr lastDraftPaths#
+
+ +
+
+TensorConstPtr acceptedTokens#
+
+ +
+
+TensorConstPtr acceptedLens#
+
+ +
+
+TensorConstPtr acceptedPathIds#
+
+ +
+
+TensorConstPtr chunkedContextNextTokens#
+
+ +
+
+TensorConstPtr seqSlots#
+
+ +
+
+ +
+
+class ExplicitDraftTokensInputs#
+
+

Public Members

+
+
+TensorConstPtr nextDraftTokens#
+
+ +
+
+TensorConstPtr nextFlatTokens#
+
+ +
+
+TensorConstPtr nextDraftIndices#
+
+ +
+
+TensorConstPtr nextDraftProbs#
+
+ +
+
+TensorConstPtr lastDraftTokens#
+
+ +
+
+TensorConstPtr lastDraftIndices#
+
+ +
+
+TensorConstPtr masks#
+
+ +
+
+TensorConstPtr packedPositionIds#
+
+ +
+
+TensorConstPtr bestPathLengths#
+
+ +
+
+TensorConstPtr bestPathIndices#
+
+ +
+
+TensorConstPtr nextGenerationLengths#
+
+ +
+
+TensorConstPtr lastPositionIdsBase#
+
+ +
+
+TensorConstPtr lastGenerationLengths#
+
+ +
+
+TensorConstPtr maxGenLengthDevice#
+
+ +
+
+TensorConstPtr seqSlots#
+
+ +
+
+ +
+
+class ExternalDraftTokensInputs#
+
+

Public Members

+
+
+TensorPtr draftLogits#
+
+ +
+
+TensorPtr draftLogitsHost#
+
+ +
+
+TensorPtr draftProbs#
+
+ +
+
+TensorPtr targetProbs#
+
+ +
+
+TensorPtr numDraftTokens#
+
+ +
+
+TensorPtr numDraftTokensHost#
+
+ +
+
+TensorPtr draftTokenIds#
+
+ +
+
+TensorPtr draftTokenIdsHost#
+
+ +
+
+TensorPtr useDraftLogits#
+
+ +
+
+TensorPtr useDraftLogitsHost#
+
+ +
+
+SizeType32 step#
+
+ +
+
+float constantThreshold#
+
+ +
+
+bool useRandomAcceptanceThreshold#
+
+ +
+
+ +
+
+struct LookaheadInputs#
+
+

Public Members

+
+
+TensorPtr tokensPerStep#
+
+ +
+
+ +
+
+class MedusaInputs#
+
+

Public Members

+
+
+TensorConstPtr medusaPaths#
+

[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu

+
+ +
+
+TensorConstPtr medusaTreeIds#
+

[batchSize, maxTokensPerStep], on gpu

+
+ +
+
+std::vector<std::vector<TensorPtr>> medusaLogits#
+

[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu

+
+ +
+
+TensorPtr medusaCurTokensPerStep#
+

[batchSize], on gpu

+
+ +
+
+TensorConstPtr medusaTargetTokensPerStep#
+

[batchSize], on gpu

+
+ +
+
+ +
+ +
+ +
+ +
+
+

speculativeDecodingModule.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class SpeculativeDecodingModule#
+

Subclassed by tensorrt_llm::runtime::EagleModule, tensorrt_llm::runtime::LookaheadModule, tensorrt_llm::runtime::MedusaModule

+
+

Public Functions

+
+
+inline explicit SpeculativeDecodingModule( + +
+
SizeType32 maxDraftPathLen,
+
SizeType32 maxDecodingDraftTokens,
+
SizeType32 maxNumPaths,
+
+ +) noexcept#
+
+ +
+
+inline explicit SpeculativeDecodingModule() noexcept#
+
+ +
+
+virtual ~SpeculativeDecodingModule() = default#
+
+ +
+
+SpeculativeDecodingModule(SpeculativeDecodingModule const &o) = default#
+
+ +
+
+SpeculativeDecodingModule &operator=( + +
+
SpeculativeDecodingModule const &o,
+
+ +) = default#
+
+ +
+
+inline SizeType32 getMaxDraftPathLen() const noexcept#
+
+
Returns:
+

max number of draft tokens that can be accepted by one step of the decoder

+
+
+
+ +
+
+inline SizeType32 getMaxPathLen() const noexcept#
+

+one more than draft path len for prediction from primary head

+
+
Returns:
+

max number of tokens that a request can grow in one step of the decoder

+
+
+
+ +
+
+inline SizeType32 getMaxDecodingDraftTokens() const noexcept#
+
+
Returns:
+

max number of draft tokens processed by one step of the decoder

+
+
+
+ +
+
+inline SizeType32 getMaxDecodingTokens() const noexcept#
+

+one more than decoding draft tokens for prediction from primary head

+
+
Returns:
+

max number of tokens processed by one step of the decoder

+
+
+
+ +
+
+inline SizeType32 getNumPackedMasks() const noexcept#
+
+ +
+
+inline SizeType32 getMaxNumPaths() const noexcept#
+
+ +
+
+inline void setMaxDraftTokens(SizeType32 maxDraftTokens) noexcept#
+
+ +
+
+inline void setMaxDraftPathLen(SizeType32 maxDraftPathLen) noexcept#
+
+ +
+
+inline void setMaxNumPaths(SizeType32 maxNumPaths) noexcept#
+
+ +
+
+

Private Functions

+
+
+inline void computeNumPackedMasks() noexcept#
+
+ +
+
+

Private Members

+
+
+SizeType32 mMaxDraftPathLen#
+
+ +
+
+SizeType32 mMaxDecodingDraftTokens#
+
+ +
+
+SizeType32 mMaxNumPaths#
+
+ +
+
+SizeType32 mMaxNumPackedMasks#
+
+ +
+
+ +
+ +
+ +
+
+

iGptDecoderBatched.h#

+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+ +
+
+namespace runtime
+
+
+class IGptDecoderBatched#
+
+#include <iGptDecoderBatched.h>
+

GPT decoder class with support for in-flight batching.

+

Subclassed by tensorrt_llm::runtime::GptDecoderBatched

+
+

Public Types

+
+
+using CudaStreamPtr = std::shared_ptr<CudaStream>#
+
+ +
+
+using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
+
+ +
+
+using RequestVector = std::vector<LlmRequestPtr>#
+
+ +
+
+using TensorPtr = std::shared_ptr<ITensor>#
+
+ +
+
+

Public Functions

+
+
+virtual void setup( + +
+
executor::DecodingMode const &mode,
+
SizeType32 maxNumSequences,
+
SizeType32 maxBeamWidth,
+
nvinfer1::DataType dtype,
+
ModelConfig const &modelConfig,
+
WorldConfig const &worldConfig,
+
+ +) = 0#
+

Setup the decoder before calling forward()

+
+ +
+
+virtual void disableLookahead( + +
+
RequestVector const &genRequests,
+
TensorPtr const &batchSlots,
+
+ +) = 0#
+

Disable Lookahead decoding.

+
+ +
+
+virtual CudaEvent forwardAsync( + +
+
decoder::DecoderState const &decoderState,
+
batch_manager::DecoderInputBuffers const &input,
+
+ +) = 0#
+

Run one step for all requests without blocking the host process and return the token for synchronization.

+
+ +
+
+virtual void forward( + +
+
decoder::DecoderState const &decoderState,
+
batch_manager::DecoderInputBuffers const &input,
+
+ +) = 0#
+

Run one step for all requests and wait for completion on the host.

+
+ +
+
+virtual CudaEvent finalize( + +
+
decoder::DecoderState const &decoderState,
+
SizeType32 batchSlot,
+
SamplingConfig const &samplingConfig,
+
bool streaming,
+
+ +) const = 0#
+

Gather final beam search results for request batchSlot. Result will only be available after event returned.

+
+ +
+
+

Protected Functions

+
+
+IGptDecoderBatched() = default#
+
+ +
+
+virtual ~IGptDecoderBatched() = default#
+
+ +
+
+ +
+
+namespace decoder#
+
+ +
+ +
+ +
+
+

eagleModule.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class EagleModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
+
+

Public Functions

+
+
+inline explicit EagleModule( + +
+
SizeType32 maxDraftPathLen,
+
SizeType32 maxDecodingDraftTokens,
+
SizeType32 numTransformersLayer,
+
SizeType32 maxNonLeafNodesPerLayer,
+
+ +) noexcept#
+
+ +
+
+inline explicit EagleModule() noexcept#
+
+ +
+
+inline executor::EagleChoices const &getDefaultEagleChoices( + +
+
+ +) const noexcept#
+
+ +
+
+inline SizeType32 getNumTransformerLayers() const noexcept#
+
+ +
+
+inline SizeType32 getMaxNonLeafNodesPerLayer() const noexcept#
+
+ +
+
+

Private Members

+
+
+SizeType32 mNumTransformersLayer#
+
+ +
+
+SizeType32 mMaxNonLeafNodesPerLayer#
+
+ +
+
+executor::EagleChoices mDefaultEagleChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}#
+
+ +
+
+ +
+ +
+ +
+
+

tllmLogger.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class TllmLogger : public nvinfer1::ILogger#
+
+

Public Functions

+
+
+void log( + +
+
Severity severity,
+
nvinfer1::AsciiChar const *msg,
+
+ +) noexcept override#
+
+ +
+
+Severity getLevel()#
+
+ +
+
+void setLevel(Severity level)#
+
+ +
+
+ +
+ +
+ +
+
+

gptDecoderBatched.h#

+
+
+namespace tensorrt_llm
+
+
+namespace batch_manager
+
+ +
+
+namespace runtime
+
+
+class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched#
+
+#include <gptDecoderBatched.h>
+

GPT decoder class with support for in-flight batching.

+
+

Public Types

+
+
+using CudaStreamPtr = std::shared_ptr<CudaStream>#
+
+ +
+
+using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
+
+ +
+
+using RequestVector = std::vector<LlmRequestPtr>#
+
+ +
+
+using TensorPtr = ITensor::SharedPtr#
+
+ +
+
+

Public Functions

+
+
+explicit GptDecoderBatched(CudaStreamPtr stream)#
+
+ +
+
+virtual void setup( + +
+
executor::DecodingMode const &mode,
+
SizeType32 maxNumSequences,
+
SizeType32 maxBeamWidth,
+
nvinfer1::DataType dtype,
+
ModelConfig const &modelConfig,
+
WorldConfig const &worldConfig,
+
+ +) override#
+

Setup the decoder before calling forward()

+
+ +
+
+virtual void disableLookahead( + +
+
RequestVector const &genRequests,
+
TensorPtr const &batchSlots,
+
+ +) override#
+

Disable Lookahead decoding.

+
+ +
+
+virtual CudaEvent forwardAsync( + +
+
decoder::DecoderState const &decoderState,
+
batch_manager::DecoderInputBuffers const &input,
+
+ +) override#
+

Run one step for all requests without blocking the host process and return the token for synchronization.

+
+ +
+
+virtual void forward( + +
+
decoder::DecoderState const &decoderState,
+
batch_manager::DecoderInputBuffers const &input,
+
+ +) override#
+

Run one step for all requests and wait for completion on the host.

+
+ +
+
+virtual CudaEvent finalize( + +
+
decoder::DecoderState const &decoderState,
+
SizeType32 batchSlot,
+
SamplingConfig const &samplingConfig,
+
bool streaming,
+
+ +) const override#
+

Gather final beam search results for request batchSlot. Result will only be available after event returned.

+
+ +
+
+inline CudaStreamPtr getDecoderStream() const#
+
+ +
+
+inline IGptDecoder &getUnderlyingDecoder() const#
+
+ +
+
+inline BufferManager const &getBufferManager() const#
+
+ +
+
+

Private Types

+
+
+using GptDecoderPtr = std::unique_ptr<IGptDecoder>#
+
+ +
+
+

Private Functions

+
+
+void forwardDispatch( + +
+
decoder::DecoderState const &decoderState,
+
batch_manager::DecoderInputBuffers const &input,
+
+ +)#
+

Calls decoders for tokens per engine step.

+
+ +
+
+

Private Members

+
+
+CudaStreamPtr mRuntimeStream#
+
+ +
+
+CudaStreamPtr mDecoderStream#
+
+ +
+
+BufferManager mBufferManager#
+
+ +
+
+GptDecoderPtr mDecoder#
+
+ +
+
+ +
+ +
+ +
+
+

cudaStream.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class CudaStream#
+
+

Public Functions

+
+
+inline explicit CudaStream( + +
+
unsigned int flags = cudaStreamNonBlocking,
+
int priority = 0,
+
+ +)#
+

Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.

+
+
Parameters:
+
    +
  • flags – Flags for stream creation. See cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • +
  • priority – Priority of the stream. Lower numbers represent higher priorities. See cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

  • +
+
+
+
+ +
+
+inline explicit CudaStream( + +
+
cudaStream_t stream,
+
int device,
+
bool ownsStream = true,
+
+ +)#
+

Pass an existing cuda stream to this object.

+
+
Parameters:
+
    +
  • stream – The stream to pass to this object.

  • +
  • device – The device on which the stream was created.

  • +
  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

  • +
+
+
+
+ +
+
+inline explicit CudaStream(cudaStream_t stream)#
+

Construct with an existing cuda stream or the default stream by passing nullptr.

+
+ +
+
+inline int getDevice() const#
+

Returns the device on which the stream was created.

+
+ +
+
+inline cudaStream_t get() const#
+

Returns the stream associated with this object.

+
+ +
+
+inline void synchronize() const#
+

Synchronizes the stream.

+
+ +
+
+inline void record(CudaEvent::pointer event) const#
+

Record an event on the stream.

+
+ +
+
+inline void record(CudaEvent const &event) const#
+

Record an event on the stream.

+
+ +
+
+inline void wait(CudaEvent::pointer event) const#
+

Wait for an event.

+
+ +
+
+inline void wait(CudaEvent const &event) const#
+

Wait for an event.

+
+ +
+
+

Private Types

+
+
+using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>#
+
+ +
+
+

Private Members

+
+
+StreamPtr mStream#
+
+ +
+
+int mDevice = {-1}#
+
+ +
+
+
+class Deleter#
+
+

Public Functions

+
+
+inline explicit Deleter(bool ownsStream)#
+
+ +
+
+inline explicit Deleter()#
+
+ +
+
+inline constexpr void operator()(cudaStream_t stream) const#
+
+ +
+
+

Private Members

+
+
+bool mOwnsStream#
+
+ +
+
+ +
+ +
+ +
+ +
+
+

ipcNvlsMemory.h#

+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+

Functions

+
+
+void MPI_group_barrier(std::set<int> ranks)#
+
+ +
+
+bool ipcNvlsSupported()#
+
+ +
+
+IpcNvlsHandle *ipcNvlsAllocate(size_t size, std::set<int> ranks)#
+
+ +
+
+void ipcNvlsFree(IpcNvlsHandle *handle)#
+
+ +
+
+
+template<typename T>
class DeviceAllocationNvls#
+
+

Public Functions

+
+
+DeviceAllocationNvls() = default#
+
+ +
+
+inline ~DeviceAllocationNvls()#
+
+ +
+
+inline void reset(size_t size, std::set<int> ranks)#
+
+ +
+
+inline T *getMulticastPointer() const#
+
+ +
+
+inline T *getUnicastPointer() const#
+
+ +
+
+inline T **getIpcUnicastPointers()#
+
+ +
+
+inline size_t getCapacity() const#
+
+ +
+
+inline void free()#
+
+ +
+
+

Private Members

+
+
+size_t _capacity = 0#
+
+ +
+
+IpcNvlsHandle *_handle#
+
+ +
+
+ +
+
+struct IpcNvlsHandle#
+
+

Public Members

+
+
+size_t size = 0#
+
+ +
+
+uintptr_t uc_ptr = 0#
+
+ +
+
+uintptr_t mc_ptr = 0#
+
+ +
+
+std::vector<uintptr_t> ipc_uc_ptrs#
+
+ +
+
+CUdeviceptr uc_va#
+
+ +
+
+CUdeviceptr mc_va#
+
+ +
+
+std::vector<CUdeviceptr> ipc_uc_vas#
+
+ +
+
+CUmemGenericAllocationHandle uc_handle#
+
+ +
+
+CUmemGenericAllocationHandle mc_handle#
+
+ +
+
+std::vector<CUmemGenericAllocationHandle> ipc_uc_handles#
+
+ +
+
+ +
+ +
+ +
+
+

samplingConfig.h#

+
+

Defines

+
+
+SET_FROM_OPTIONAL(varName, VarName, VarType)#
+
+ +
+
+
+namespace tensorrt_llm
+
+
+namespace runtime
+
+
+class SamplingConfig#
+
+

Public Functions

+
+
+inline explicit SamplingConfig(SizeType32 beamWidth = 1)#
+
+ +
+
+inline explicit SamplingConfig( + +
+
std::vector<SamplingConfig> const &configs,
+
+ +)#
+
+ +
+
+inline explicit SamplingConfig( + +
+
executor::SamplingConfig const &samplingConfig,
+
std::optional<executor::ExternalDraftTokensConfig> const &externalDraftTokensConfig = std::nullopt,
+
+ +)#
+
+ +
+
+inline bool validate()#
+
+ +
+
+template<typename T>
inline bool useDefaultValues( + +
+
OptVec<T> const &vec,
+
T defaultValue,
+
+ +)#
+
+ +
+
+inline bool operator==(SamplingConfig const &other) const#
+
+ +
+
+inline SizeType32 getNumReturnBeams() const#
+
+ +
+
+inline SizeType32 getMaxBeamWidth() const noexcept#
+
+ +
+
+

Public Members

+
+
+SizeType32 beamWidth#
+
+ +
+
+std::optional<SizeType32> numReturnSequences#
+
+ +
+
+OptVec<FloatType> temperature#
+
+ +
+
+OptVec<FloatType> originalTemperature#
+
+ +
+
+OptVec<SizeType32> minLength#
+
+ +
+
+OptVec<FloatType> repetitionPenalty#
+
+ +
+
+OptVec<FloatType> presencePenalty#
+
+ +
+
+OptVec<FloatType> frequencyPenalty#
+
+ +
+
+OptVec<SizeType32> promptIgnoreLength#
+
+ +
+
+OptVec<SizeType32> noRepeatNgramSize#
+
+ +
+
+OptVec<bool> outputLogProbs#
+
+ +
+
+OptVec<bool> cumLogProbs#
+
+ +
+
+OptVec<SizeType32> topK#
+
+ +
+
+OptVec<FloatType> topP#
+
+ +
+
+OptVec<uint64_t> randomSeed#
+
+ +
+
+OptVec<FloatType> topPDecay#
+
+ +
+
+OptVec<FloatType> topPMin#
+
+ +
+
+OptVec<TokenIdType> topPResetIds#
+
+ +
+
+OptVec<FloatType> minP#
+
+ +
+
+OptVec<FloatType> beamSearchDiversityRate#
+
+ +
+
+OptVec<FloatType> lengthPenalty#
+
+ +
+
+OptVec<SizeType32> earlyStopping#
+
+ +
+
+OptVec<std::vector<SizeType32>> beamWidthArray#
+
+ +
+
+OptVec<FloatType> draftAcceptanceThreshold#
+
+ +
+
+OptVec<std::vector<SizeType32>> topKMedusaHeads#
+
+ +
+
+std::optional<bool> normalizeLogProbs#
+
+ +
+
+

Private Types

+
+
+using FloatType = float#
+
+ +
+
+template<typename T>
using OptVec = std::optional<std::vector<T>>#
+
+ +
+
+

Private Functions

+
+
+template<typename T>
inline bool validateVec( + +
+
std::string name,
+
OptVec<T> const &vec,
+
T min,
+
std::optional<T> max = std::nullopt,
+
+ +)#
+
+ +
+
+

Private Static Functions

+
+
+template<typename T>
static inline OptVec<T> fuseValues( + +
+
std::vector<SamplingConfig> const &configs,
+
std::function<OptVec<T>(size_t ci)> accessor,
+
T defaultValue,
+
+ +)#
+
+ +
+
+ +
+ +
+

decoderState.h#

@@ -6526,4404 +12305,6 @@ one more than decoding draft tokens for prediction from primary head

-
-
-

lookaheadBuffers.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class LookaheadDecodingBuffers#
-
-

Public Types

-
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-

Public Functions

-
-
-LookaheadDecodingBuffers( - -
-
SizeType32 maxNumSequences,
-
SizeType32 maxTokensPerStep,
-
BufferManager const &bufferManager,
-
- -)#
-
- -
-
-

Public Members

-
-
-TensorPtr generationLengths#
-
- -
-
-TensorPtr positionOffsets#
-
- -
-
-TensorPtr packedMasks#
-
- -
-
-TensorPtr positionIds#
-
- -
-
- -
-
-class LookaheadRuntimeBuffers#
-
-

Public Types

-
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-using TensorMap = StringPtrMap<ITensor>#
-
- -
-
-

Public Functions

-
-
-LookaheadRuntimeBuffers( - -
-
SizeType32 maxBatchSize,
-
SizeType32 maxBeamWidth,
-
BufferManager const &manager,
-
ModelConfig const &modelConfig,
-
WorldConfig const &worldConfig,
-
executor::DecodingConfig const &decodingConfig,
-
TllmRuntime const &runtime,
-
- -)#
-
- -
-
-void setFromInputs( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
ITensor const &requestTypes,
-
ITensor const &seqSlots,
-
LookaheadDecodingBuffers const &decoderLookaheadBuffers,
-
TllmRuntime const &runtime,
-
ModelConfig const &modelConfig,
-
WorldConfig const &worldConfig,
-
- -) const#
-
- -
-
-void reshape( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
SizeType32 tokensPerStep,
-
- -)#
-
- -
-
-void insertInputTensors( - -
-
TensorMap &inputBuffers,
-
TensorMap &outputBuffers,
-
WorldConfig const &worldConfig,
-
- -) const#
-
- -
-
-void enableLookaheadDecoding( - -
-
SizeType32 maxBatchSize,
-
SizeType32 tokensPerStep,
-
- -)#
-
- -
-
-void disableLookaheadDecoding()#
-
- -
-
-

Public Members

-
-
-TensorPtr cumSumLength#
-
- -
-
-TensorPtr packedMasksDevice#
-
- -
-
-TensorPtr generationLengthsDevice#
-
- -
-
-TensorPtr positionOffsetsDevice#
-
- -
-
-TensorPtr positionIdsDevice#
-
- -
-
-TensorPtr packedMaskHost#
-
- -
-
-TensorPtr generationLengthsHost#
-
- -
-
-TensorPtr positionOffsetsHost#
-
- -
-
-TensorPtr positionIdsHost#
-
- -
-
-TensorPtr packedMaskHostCopy#
-
- -
-
-TensorPtr generationLengthsHostCopy#
-
- -
-
-TensorPtr positionOffsetsHostCopy#
-
- -
-
-TensorPtr positionIdsHostCopy#
-
- -
-
-TensorPtr useSpecDecoding#
-
- -
-
-TensorPtr batchSlotsHostCopy#
-
- -
-
- -
- -
- -
-
-

eagleModule.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class EagleModule : public tensorrt_llm::runtime::SpeculativeDecodingModule#
-
-

Public Functions

-
-
-inline explicit EagleModule( - -
-
SizeType32 maxDraftPathLen,
-
SizeType32 maxDecodingDraftTokens,
-
SizeType32 numTransformersLayer,
-
SizeType32 maxNonLeafNodesPerLayer,
-
- -) noexcept#
-
- -
-
-inline explicit EagleModule() noexcept#
-
- -
-
-inline executor::EagleChoices const &getDefaultEagleChoices( - -
-
- -) const noexcept#
-
- -
-
-inline SizeType32 getNumTransformerLayers() const noexcept#
-
- -
-
-inline SizeType32 getMaxNonLeafNodesPerLayer() const noexcept#
-
- -
-
-

Private Members

-
-
-SizeType32 mNumTransformersLayer#
-
- -
-
-SizeType32 mMaxNonLeafNodesPerLayer#
-
- -
-
-executor::EagleChoices mDefaultEagleChoices = {{0}, {0, 0}, {1}, {0, 1}, {2}, {0, 0, 0}, {1, 0}, {0, 2}, {3}, {0, 3}, {4}, {0, 4}, {2, 0}, {0, 5}, {0, 0, 1}, {5}, {0, 6}, {6}, {0, 7}, {0, 1, 0}, {1, 1}, {7}, {0, 8}, {0, 0, 2}, {3, 0}, {0, 9}, {8}, {9}, {1, 0, 0}, {0, 2, 0}, {1, 2}, {0, 0, 3}, {4, 0}, {2, 1}, {0, 0, 4}, {0, 0, 5}, {0, 0, 0, 0}, {0, 1, 1}, {0, 0, 6}, {0, 3, 0}, {5, 0}, {1, 3}, {0, 0, 7}, {0, 0, 8}, {0, 0, 9}, {6, 0}, {0, 4, 0}, {1, 4}, {7, 0}, {0, 1, 2}, {2, 0, 0}, {3, 1}, {2, 2}, {8, 0}, {0, 5, 0}, {1, 5}, {1, 0, 1}, {0, 2, 1}, {9, 0}, {0, 6, 0}, {0, 0, 0, 1}, {1, 6}, {0, 7, 0}}#
-
- -
-
- -
- -
- -
-
-

runtimeDefaults.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-struct RuntimeDefaults#
-
-

Public Functions

-
-
-inline RuntimeDefaults( - -
-
std::optional<std::vector<SizeType32>> maxAttentionWindowVec,
-
std::optional<SizeType32> sinkTokenLength,
-
- -)#
-
- -
-
-RuntimeDefaults() = default#
-
- -
-
-

Public Members

-
-
-std::optional<std::vector<SizeType32>> maxAttentionWindowVec#
-
- -
-
-std::optional<SizeType32> sinkTokenLength#
-
- -
-
- -
- -
- -
-
-

decodingOutput.h#

-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
- -
-
-namespace runtime
-
-
-class DecodingOutput#
-
-

Public Types

-
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-

Public Functions

-
-
-DecodingOutput() = default#
-
- -
-
-

Public Members

-
-
-TensorPtr ids#
-

Mandatory parameters. Previously generated token ids for all steps before DecodingInput.step, [BS, BM, MSL].

-
- -
-
-TensorPtr gatheredIds#
-

The tokens computed during the gatherTree step, [BS, BM, MSL] Necessary for “Streaming + Beam Search” mode since beam search kernels store ungathered tokens in ids.

-
- -
-
-TensorPtr newTokensSteps#
-

New tokens at each generated token of maxTokensPerStep, [maxTokensPerStep, BS, BM].

-
- -
-
-TensorPtr newTokens#
-

A view of newTokensSteps for the current token, [BS, BM].

-
- -
-
-std::vector<TensorPtr> newTokensVec#
-

A Vector of views on newTokensSteps for each token [BS, BM].

-
- -
-
-TensorPtr finishReasons#
-

Optional parameters FinishedState by decoding if any of the stop conditions are met or if DecodingInput.finished is true, [BS, BM]

-
- -
-
-TensorPtr finishedSum#
-

The sum of finished sequences per request, in pinned memory, [BS].

-
- -
-
-TensorPtr logProbs#
-

Mandatory parameters for Beam Search. Log-probability of generated tokens, [BS, BM, MSL], float.

-
- -
-
-TensorPtr cumLogProbs#
-

Sum of the log-probabilities of all generated tokens, [BS, BM].

-
- -
-
-TensorPtr parentIds#
-

Index of the beam where the previous token is, [BS, BM, MSL].

-
- -
-
-TensorPtr lengths#
-

Total sequence lengths including padding, [BS, BM].

-
- -
-
-TensorPtr cacheIndirection#
-

K/V indirection for next generation step, [BS, BM, MSL].

-
- -
-
-TensorPtr logProbsTiled#
-

Buffer used to store the transpose of the logProbs, [MSL, BS, BM].

-
- -
-
-BeamHypotheses beamHypotheses#
-
- -
-
-std::optional<SpeculativeDecodingOutputs> speculativeDecodingOutputs#
-
- -
-
-std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers#
-
- -
-
-std::optional<LookaheadDecodingBuffers> lookaheadOutputs#
-
- -
-
-std::optional<EagleBuffers::Inputs> eagleBuffers#
-
- -
-
-

Public Static Attributes

-
-
-static float constexpr kNegativeInfinity = -1e20f#
-
- -
-
-
-class BeamHypotheses#
-
-

Public Functions

-
-
-void empty(BufferManager const &manager)#
-
- -
-
-void reshape( - -
-
SizeType32 batchSize,
-
SizeType32 beamWidth,
-
SizeType32 maxSequenceLength,
-
- -)#
-
- -
-
-void release()#
-
- -
-
-void init(BufferManager const &manager, TokenIdType endId)#
-
- -
-
-BeamHypotheses slice(SizeType32 batchIndex, SizeType32 size) const#
-
- -
-
-

Public Members

-
-
-TensorPtr outputIdsCBA#
-
- -
-
-TensorPtr logProbsCBA#
-
- -
-
-TensorPtr sequenceLengthsCBA#
-
- -
-
-TensorPtr cumLogProbsCBA#
-
- -
-
-TensorPtr normedScoresCBA#
-
- -
-
-TensorPtr numBeamsCBA#
-
- -
-
-TensorPtr minNormedScoresCBA#
-
- -
-
-TensorPtr batchDones#
-
- -
-
- -
-
-class SpeculativeDecodingOutputs#
-
-

Public Members

-
-
-TensorPtr nextDraftTokens#
-
- -
-
-TensorPtr nextDraftTokensLen#
-
- -
-
-TensorPtr prevDraftTokensLen#
-
- -
-
-TensorPtr acceptedTokensLen#
-
- -
-
-TensorPtr acceptedLengthsCumSum#
-
- -
-
-TensorPtr pathsOffsets#
-
- -
-
- -
- -
- -
- -
-
-

decodingInput.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class DecodingInput#
-
-#include <decodingInput.h>
-

Represents the inputs to the decoder.

-

This input type is assumed immutable. It represents whatever the decoder received initially, and can always be referred to as such.

-
-

Public Types

-
-
-using TensorConstPtr = ITensor::SharedConstPtr#
-
- -
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-

Public Functions

-
-
-DecodingInput() = default#
-
- -
-
-

Public Members

-
-
-SizeType32 step = {}#
-

Mandatory parameters. The index of the decoding step we are on. Only used in Python runtime.

-
- -
-
-SizeType32 maxLength = {}#
-

The maximum number of tokens to decode.

-
- -
-
-SizeType32 maxAttentionWindow = {}#
-

The maximum length of the attention window to consider while decoding.

-
- -
-
-SizeType32 sinkTokenLength = {}#
-

The number of tokens to use as attention sinks, https://arxiv.org/html/2309.17453v3.

-
- -
-
-SizeType32 batchSize = {}#
-

The number of samples in the batch.

-
- -
-
-std::vector<SizeType32> beamWidths#
-

The beam widths of each request, [batchSize].

-
- -
-
-SizeType32 maxStopWordsLen = {}#
-

The maximum value in the stopWordsLens tensor.

-
- -
-
-SizeType32 maxBadWordsLen = {}#
-

The maximum value in the badWordsLens tensor.

-
- -
-
-std::vector<TensorConstPtr> logitsVec#
-

The output of the model forward computation, a probability distribution over the vocabulary [batchSize][numGenTokens, beamWidth, vocabSizePadded] on gpu

-
- -
-
-TensorConstPtr endIds#
-

The end ids, [batchSize * beamWidth] on gpu.

-
- -
-
-TensorConstPtr batchSlots#
-

Address map of the linear batch id to the seq slots, [batchSize] on pinned, int32_t.

-
- -
-
-TensorConstPtr finishReasons#
-

Optional parameters. Finished states at current iteration (skip decoding step of a request if true), [batchSize, beamWidth] on gpu.

-
- -
-
-TensorConstPtr sequenceLimitLength#
-

The maximum sequence length for each sequence in the batch, [batchSize] on gpu.

-
- -
-
-TensorConstPtr embeddingBias#
-
- -
-
-TensorConstPtr lengths#
-
- -
-
-std::vector<TensorPtr> badWordsLists#
-
- -
-
-TensorConstPtr badWordsPtrs#
-
- -
-
-TensorConstPtr badWordsLens#
-
- -
-
-std::vector<TensorPtr> stopWordsLists#
-
- -
-
-TensorConstPtr stopWordsPtrs#
-
- -
-
-TensorConstPtr stopWordsLens#
-
- -
-
-TensorConstPtr noRepeatNgramSize#
-
- -
-
-TensorPtr cacheIndirection#
-

Parameters for beam search. KV cache index for beam search, [batchSize, beamWidth, maxSeqLen] on gpu.

-
- -
-
-std::optional<std::vector<SizeType32>> generationSteps#
-

Steps of each request, for Variable-Beam-Width-Search, [batchSize].

-
- -
-
-std::optional<MedusaInputs> medusaInputs#
-
- -
-
-std::optional<ExplicitDraftTokensInputs> explicitDraftTokensInputs#
-
- -
-
-std::optional<LookaheadInputs> lookaheadInputs#
-
- -
-
-std::optional<ExternalDraftTokensInputs> externalDraftTokensInputs#
-
- -
-
-std::optional<EagleInputs> eagleInputs#
-
- -
-
-
-struct EagleInputs#
-
-

Public Members

-
-
-TensorConstPtr nextDraftTokens#
-
- -
-
-TensorConstPtr nextDraftLens#
-
- -
-
-TensorConstPtr nextDraftPaths#
-
- -
-
-TensorConstPtr lastDraftTokens#
-
- -
-
-TensorConstPtr lastDraftLens#
-
- -
-
-TensorConstPtr lastDraftPaths#
-
- -
-
-TensorConstPtr acceptedTokens#
-
- -
-
-TensorConstPtr acceptedLens#
-
- -
-
-TensorConstPtr acceptedPathIds#
-
- -
-
-TensorConstPtr chunkedContextNextTokens#
-
- -
-
-TensorConstPtr seqSlots#
-
- -
-
- -
-
-class ExplicitDraftTokensInputs#
-
-

Public Members

-
-
-TensorConstPtr nextDraftTokens#
-
- -
-
-TensorConstPtr nextFlatTokens#
-
- -
-
-TensorConstPtr nextDraftIndices#
-
- -
-
-TensorConstPtr nextDraftProbs#
-
- -
-
-TensorConstPtr lastDraftTokens#
-
- -
-
-TensorConstPtr lastDraftIndices#
-
- -
-
-TensorConstPtr masks#
-
- -
-
-TensorConstPtr packedPositionIds#
-
- -
-
-TensorConstPtr bestPathLengths#
-
- -
-
-TensorConstPtr bestPathIndices#
-
- -
-
-TensorConstPtr nextGenerationLengths#
-
- -
-
-TensorConstPtr lastPositionIdsBase#
-
- -
-
-TensorConstPtr lastGenerationLengths#
-
- -
-
-TensorConstPtr maxGenLengthDevice#
-
- -
-
-TensorConstPtr seqSlots#
-
- -
-
- -
-
-class ExternalDraftTokensInputs#
-
-

Public Members

-
-
-TensorPtr draftLogits#
-
- -
-
-TensorPtr draftLogitsHost#
-
- -
-
-TensorPtr draftProbs#
-
- -
-
-TensorPtr targetProbs#
-
- -
-
-TensorPtr numDraftTokens#
-
- -
-
-TensorPtr numDraftTokensHost#
-
- -
-
-TensorPtr draftTokenIds#
-
- -
-
-TensorPtr draftTokenIdsHost#
-
- -
-
-TensorPtr useDraftLogits#
-
- -
-
-TensorPtr useDraftLogitsHost#
-
- -
-
-SizeType32 step#
-
- -
-
-float constantThreshold#
-
- -
-
-bool useRandomAcceptanceThreshold#
-
- -
-
- -
-
-struct LookaheadInputs#
-
-

Public Members

-
-
-TensorPtr tokensPerStep#
-
- -
-
- -
-
-class MedusaInputs#
-
-

Public Members

-
-
-TensorConstPtr medusaPaths#
-

[batchSize, maxTokensPerStep, maxMedusaHeads + 1], on gpu

-
- -
-
-TensorConstPtr medusaTreeIds#
-

[batchSize, maxTokensPerStep], on gpu

-
- -
-
-std::vector<std::vector<TensorPtr>> medusaLogits#
-

[batchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded], on gpu

-
- -
-
-TensorPtr medusaCurTokensPerStep#
-

[batchSize], on gpu

-
- -
-
-TensorConstPtr medusaTargetTokensPerStep#
-

[batchSize], on gpu

-
- -
-
- -
- -
- -
- -
-
-

worldConfig.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class WorldConfig#
-
-

Public Functions

-
-
-explicit WorldConfig( - -
-
SizeType32 tensorParallelism = 1,
-
SizeType32 pipelineParallelism = 1,
-
SizeType32 contextParallelism = 1,
-
SizeType32 rank = 0,
-
SizeType32 gpusPerNode = kDefaultGpusPerNode,
-
std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
-
bool enableAttentionDP = false,
-
- -)#
-
- -
-
-inline SizeType32 constexpr getSize() const noexcept#
-
- -
-
-inline SizeType32 constexpr getTensorParallelism() const noexcept#
-
- -
-
-inline bool constexpr isTensorParallel() const noexcept#
-
- -
-
-inline SizeType32 constexpr getPipelineParallelism() const noexcept#
-
- -
-
-inline bool constexpr isPipelineParallel() const noexcept#
-
- -
-
-inline SizeType32 constexpr getContextParallelism() const noexcept#
-
- -
-
-inline bool constexpr isContextParallel() const noexcept#
-
- -
-
-inline SizeType32 constexpr getRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getGpusPerNode() const noexcept#
-
- -
-
-inline SizeType32 getGpusPerGroup() const noexcept#
-
- -
-
-inline SizeType32 getDevice() const noexcept#
-
- -
-
-inline SizeType32 getDeviceOf(SizeType32 rank) const noexcept#
-
- -
-
-inline SizeType32 constexpr getPipelineParallelRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getTensorParallelRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getContextParallelRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getLocalRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getNodeRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getNodeRankOf( - -
-
SizeType32 rank,
-
- -) const noexcept#
-
- -
-
-inline bool constexpr isFirstPipelineParallelRank() const noexcept#
-
- -
-
-inline bool constexpr isLastPipelineParallelRank() const noexcept#
-

Is my rank the last rank in its pipeline?

-
- -
-
-inline bool constexpr isFirstTensorParallelRank() const noexcept#
-
- -
-
-inline bool constexpr isFirstContextParallelRank() const noexcept#
-
- -
-
-inline SizeType32 constexpr getLastRank() const noexcept#
-
- -
-
-inline bool constexpr enableAttentionDP() const noexcept#
-
- -
-
-std::vector<SizeType32> getPipelineParallelGroup() const#
-
- -
-
-std::vector<SizeType32> getTensorParallelGroup() const#
-
- -
-
-std::vector<SizeType32> getContextParallelGroup() const#
-
- -
-
-bool validMpiConfig() const#
-
- -
-
-

Public Static Functions

-
-
-static WorldConfig mpi( - -
-
SizeType32 gpusPerNode = kDefaultGpusPerNode,
-
std::optional<SizeType32> tensorParallelism = std::nullopt,
-
std::optional<SizeType32> pipelineParallelism = std::nullopt,
-
std::optional<SizeType32> contextParallelism = std::nullopt,
-
std::optional<std::vector<SizeType32>> const &deviceIds = std::nullopt,
-
bool enableAttentionDP = false,
-
- -)#
-
- -
-
-

Public Static Attributes

-
-
-static SizeType32 constexpr kDefaultGpusPerNode = 1#
-
- -
-
-

Private Members

-
-
-SizeType32 mTensorParallelism#
-
- -
-
-SizeType32 mPipelineParallelism#
-
- -
-
-SizeType32 mContextParallelism#
-
- -
-
-SizeType32 mRank#
-
- -
-
-SizeType32 mGpusPerNode#
-
- -
-
-bool mEnableAttentionDP#
-
- -
-
-std::vector<SizeType32> mDeviceIds#
-
- -
-
- -
- -
- -
-
-

gptDecoderBatched.h#

-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
- -
-
-namespace runtime
-
-
-class GptDecoderBatched : public tensorrt_llm::runtime::IGptDecoderBatched#
-
-#include <gptDecoderBatched.h>
-

GPT decoder class with support for in-flight batching.

-
-

Public Types

-
-
-using CudaStreamPtr = std::shared_ptr<CudaStream>#
-
- -
-
-using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
-
- -
-
-using RequestVector = std::vector<LlmRequestPtr>#
-
- -
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-

Public Functions

-
-
-explicit GptDecoderBatched(CudaStreamPtr stream)#
-
- -
-
-virtual void setup( - -
-
executor::DecodingMode const &mode,
-
SizeType32 maxNumSequences,
-
SizeType32 maxBeamWidth,
-
nvinfer1::DataType dtype,
-
ModelConfig const &modelConfig,
-
WorldConfig const &worldConfig,
-
- -) override#
-

Set up the decoder before calling forward().

-
- -
-
-virtual void disableLookahead( - -
-
RequestVector const &genRequests,
-
TensorPtr const &batchSlots,
-
- -) override#
-

Disable Lookahead decoding.

-
- -
-
-virtual CudaEvent forwardAsync( - -
-
decoder::DecoderState const &decoderState,
-
decoder_batch::Input const &input,
-
- -) override#
-

Run one step for all requests without blocking the host process and return the token for synchronization.

-
- -
-
-virtual void forward( - -
-
decoder::DecoderState const &decoderState,
-
decoder_batch::Input const &input,
-
- -) override#
-

Run one step for all requests and wait for completion on the host.

-
- -
-
-virtual CudaEvent finalize( - -
-
decoder::DecoderState const &decoderState,
-
SizeType32 batchSlot,
-
SamplingConfig const &samplingConfig,
-
bool streaming,
-
- -) const override#
-

Gather final beam search results for request batchSlot. Result will only be available after event returned.

-
- -
-
-inline CudaStreamPtr getDecoderStream() const#
-
- -
-
-inline IGptDecoder &getUnderlyingDecoder() const#
-
- -
-
-inline BufferManager const &getBufferManager() const#
-
- -
-
-

Private Types

-
-
-using GptDecoderPtr = std::unique_ptr<IGptDecoder>#
-
- -
-
-

Private Functions

-
-
-void forwardDispatch( - -
-
decoder::DecoderState const &decoderState,
-
decoder_batch::Input const &input,
-
- -)#
-

Calls decoders for tokens per engine step.

-
- -
-
-

Private Members

-
-
-CudaStreamPtr mRuntimeStream#
-
- -
-
-CudaStreamPtr mDecoderStream#
-
- -
-
-BufferManager mBufferManager#
-
- -
-
-GptDecoderPtr mDecoder#
-
- -
-
- -
- -
- -
-
-

explicitDraftTokensBuffers.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class ExplicitDraftTokensBuffers#
-
-

Public Types

-
-
-using SizeType32 = runtime::SizeType32#
-
- -
-
-using ITensor = runtime::ITensor#
-
- -
-
-using BufferPtr = runtime::IBuffer::SharedPtr#
-
- -
-
-using TensorPtr = runtime::ITensor::SharedPtr#
-
- -
-
-using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
-
- -
-
-

Public Functions

-
-
-ExplicitDraftTokensBuffers( - -
-
SizeType32 maxBatchSize,
-
SizeType32 maxBeamWidth,
-
runtime::BufferManager const &manager,
-
runtime::ModelConfig const &modelConfig,
-
runtime::WorldConfig const &worldConfig,
-
- -)#
-
- -
-
-void reshape( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
runtime::ModelConfig const &modelConfig,
-
- -)#
-
- -
-
-void setFromInputs( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
runtime::ITensor const &requestTypes,
-
ITensor const &seqSlots,
-
ExplicitDraftTokensBuffers::Inputs const &decoderBuffers,
-
ITensor const &contextPositionIds,
-
runtime::ModelConfig const &modelConfig,
-
runtime::WorldConfig const &worldConfig,
-
runtime::BufferManager const &manager,
-
runtime::CudaStream const &stream,
-
- -) const#
-
- -
-
-void insertInputTensors( - -
-
TensorMap &inputBuffers,
-
TensorMap &outputBuffers,
-
runtime::WorldConfig const &worldConfig,
-
- -) const#
-
- -
-
-

Public Members

-
-
-tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs engineInputs#
-
- -
-
-class tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs engineOutputs#
-
- -
-
-std::size_t scanTempStorageBytes = {0}#
-
- -
-
-BufferPtr scanTempStorage#
-
- -
-
-TensorPtr cumSumGenerationLengths#
-
- -
-
-

Private Functions

-
-
-template<typename T>
void setFromInputs( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
SizeType32 vocabSizePadded,
-
ITensor const &seqSlots,
-
ExplicitDraftTokensBuffers::Inputs const &draftBuffers,
-
ITensor const &contextPositionIds,
-
runtime::ExplicitDraftTokensModule const &explicitDraftTokensModule,
-
runtime::CudaStream const &stream,
-
- -) const#
-
- -
-
-
-class EngineInputs : public tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs#
-
-

Public Members

-
-
-TensorPtr requestTypesDevice#
-

[numSequences], on gpu

-
- -
-
-TensorPtr positionOffsets#
-

[numGenSequences]

-
- -
-
- -
-
-class EngineOutputs#
-
-

Public Members

-
-
-TensorPtr nextGenerationLengths#
-

[batchSize]

-
- -
-
-TensorPtr nextPositionOffsets#
-

[batchSize]

-
- -
-
-TensorPtr masks#
-

[batchSize, maxDecodingTokens, maxDecodingTokens], bool

-
- -
-
-TensorPtr nextDraftTokens#
-

[batchSize, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr nextDraftIndices#
-

[batchSize, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr nextDraftProbs#
-

[batchSize, maxNumPaths, maxDraftPathLen, vocabSize]

-
- -
-
-TensorPtr nextFlatTokens#
-

[batchSize * maxDecodingTokens]

-
- -
-
-TensorPtr bestPathLengths#
-

[batchSize]

-
- -
-
-TensorPtr bestPathIndices#
-

[batchSize]

-
- -
-
-TensorPtr maxGenToken#
-

[1]

-
- -
-
-TensorPtr totalGenToken#
-

[1]

-
- -
-
-TensorPtr packedPositionIds#
-

[batchSize * maxDecodingTokens]

-
- -
-
- -
-
-class Inputs#
-

Subclassed by tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs

-
-

Public Functions

-
-
-void create( - -
-
SizeType32 maxNumSequences,
-
runtime::BufferManager const &manager,
-
runtime::ModelConfig const &modelConfig,
-
runtime::WorldConfig const &worldConfig,
-
- -)#
-
- -
-
-

Public Members

-
-
-TensorPtr temperatures#
-

[maxBatchSize]

-
- -
-
-TensorPtr positionIdsBase#
-

[maxBatchSize]

-
- -
-
-TensorPtr generationLengths#
-

[maxBatchSize] or [numGenSequences]

-
- -
-
-TensorPtr randomDataSample#
-

[maxBatchSize]

-
- -
-
-TensorPtr randomDataValidation#
-

[maxBatchSize, maxNumPaths, maxPathDraftLen] or [numGenSequences, maxNumPaths, maxPathDraftLen]

-
- -
-
-TensorPtr draftTokens#
-

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr draftIndices#
-

[maxBatchSize, maxNumPaths, maxPathLen] or [numGenSequences, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr draftProbs#
-

[maxBatchSize, maxNumPaths, maxPathDraftLen, vocabSize] or [numGenSequences, maxNumPaths, maxPathDraftLen, vocabSize]

-
- -
-
-TensorPtr packedMasks#
-

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

-
- -
-
-TensorPtr positionIds#
-

[maxBatchSize] or [numGenSequences]

-
- -
-
-TensorPtr maxGenLengthHost#
-
- -
-
-TensorPtr generationLengthsHost#
-
- -
-
-TensorPtr useSpecDecoding#
-
- -
-
- -
- -
- -
- -
-
-

bufferManager.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class BufferManager#
-
-#include <bufferManager.h>
-

A helper class for managing memory on host and device.

-
-

Public Types

-
-
-using IBufferPtr = IBuffer::UniquePtr#
-
- -
-
-using ITensorPtr = ITensor::UniquePtr#
-
- -
-
-using CudaStreamPtr = std::shared_ptr<CudaStream>#
-
- -
-
-using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>#
-
- -
-
-

Public Functions

-
-
-explicit BufferManager(CudaStreamPtr stream, bool trimPool = false)#
-

Construct a BufferManager.

-
-
Parameters:
-

cudaStream[in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).

-
-
-
- -
-
-inline ~BufferManager()#
-

Destructor.

-
- -
-
-IBufferPtr gpu( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Allocates an IBuffer of the given size on the GPU, using cudaMallocAsync.

-
- -
-
-ITensorPtr gpu( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Allocates an ITensor of the given dimensions on the GPU, using cudaMallocAsync.

-
- -
-
-IBufferPtr allocate( - -
-
MemoryType memoryType,
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Allocates an IBuffer of the given size and memory type.

-
- -
-
-ITensorPtr allocate( - -
-
MemoryType memoryType,
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Allocates an ITensor of the given dimensions and memory type.

-
- -
-
-inline IBufferPtr emptyBuffer( - -
-
MemoryType memoryType,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Create an empty IBuffer of the given memory type. It may be resized later.

-
- -
-
-inline ITensorPtr emptyTensor( - -
-
MemoryType memoryType,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -) const#
-

Create an empty ITensor of the given memory type. It may be reshaped later.

-
- -
-
-void setMem(IBuffer &buffer, int32_t value) const#
-

Set the contents of the given buffer to value.

-
- -
-
-void setZero(IBuffer &buffer) const#
-

Set the contents of the given buffer to zero.

-
- -
-
-void copy(void const *src, IBuffer &dst, MemoryType srcType) const#
-

Copy src to dst.

-
- -
-
-void copy(IBuffer const &src, void *dst, MemoryType dstType) const#
-

Copy src to dst.

-
- -
-
-inline void copy(void const *src, IBuffer &dst) const#
-

Copy src to dst.

-
- -
-
-inline void copy(IBuffer const &src, void *dst) const#
-

Copy src to dst.

-
- -
-
-void copy(IBuffer const &src, IBuffer &dst) const#
-

Copy src to dst.

-
- -
-
-IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const#
-

Copy src into a new IBuffer with a potentially different memory type.

-
- -
-
-ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const#
-

Copy src into a new ITensor with a potentially different memory type.

-
- -
-
-template<typename T>
inline IBufferPtr copyFrom( - -
-
std::vector<T> const &src,
-
MemoryType memoryType,
-
- -) const#
-

Copy src into a new IBuffer with a potentially different memory type.

-
- -
-
-template<typename T>
inline ITensorPtr copyFrom( - -
-
T *src,
-
nvinfer1::Dims dims,
-
MemoryType memoryType,
-
- -) const#
-

Copy src into a new ITensor with a potentially different memory type.

-
- -
-
-template<typename T>
inline ITensorPtr copyFrom( - -
-
std::vector<T> const &src,
-
nvinfer1::Dims dims,
-
MemoryType memoryType,
-
- -) const#
-

Copy src into a new ITensor with a potentially different memory type.

-
- -
-
-CudaStream const &getStream() const#
-

Get the underlying cuda stream.

-
- -
-
-std::size_t memoryPoolReserved() const#
-

The current size of the memory reserved by the memory pool.

-
- -
-
-std::size_t memoryPoolUsed() const#
-

The current size of the memory used by the memory pool.

-
- -
-
-std::size_t memoryPoolFree() const#
-

The current size of the memory free in the memory pool.

-
- -
-
-void memoryPoolTrimTo(std::size_t size)#
-

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.

-
- -
-
-

Public Static Functions

-
-
-static IBufferPtr gpuSync( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an IBuffer of the given size on the GPU, using cudaMalloc.

-
- -
-
-static ITensorPtr gpuSync( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an ITensor of the given dimensions on the GPU, using cudaMalloc.

-
- -
-
-static IBufferPtr cpu( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an IBuffer of the given size on the CPU.

-
- -
-
-static ITensorPtr cpu( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an ITensor of the given dimensions on the CPU.

-
- -
-
-static IBufferPtr pinned( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates a pinned IBuffer of the given size on the CPU.

-
- -
-
-static ITensorPtr pinned( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates a pinned ITensor of the given dimensions on the CPU.

-
- -
-
-static IBufferPtr pinnedPool( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.

-
- -
-
-static ITensorPtr pinnedPool( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.

-
- -
-
-static IBufferPtr managed( - -
-
std::size_t size,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an IBuffer of the given size in UVM.

-
- -
-
-static ITensorPtr managed( - -
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type = kBYTE_TYPE,
-
- -)#
-

Allocates an ITensor of the given dimensions in UVM.

-
- -
-
-static ITensorPtr ipcNvls( - -
-
std::set<int> ranks,
-
nvinfer1::Dims dims,
-
nvinfer1::DataType type,
-
- -)#
-

Allocates an ITensor of the given dimensions for NVLS.

-
- -
-
-

Public Static Attributes

-
-
-static auto constexpr kBYTE_TYPE = nvinfer1::DataType::kUINT8#
-
- -
-
-

Private Members

-
-
-CudaStreamPtr mStream#
-
- -
-
-CudaMemPoolPtr mPool#
-
- -
-
-bool const mTrimPool#
-
- -
-
-

Friends

-
-
-friend class ::BufferManagerTest
-
- -
-
- -
- -
- -
-
-

loraModule.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-

Functions

-
-
-inline std::ostream &operator<<( - -
-
std::ostream &output,
-
LoraModule const &module,
-
- -)#
-
- -
-
-
-class LoraModule#
-
-

Public Types

-
-
-enum class ModuleType : SizeType32#
-

Values:

-
-
-enumerator kINVALID#
-
- -
-
-enumerator kATTN_QKV#
-
- -
-
-enumerator kATTN_Q#
-
- -
-
-enumerator kATTN_K#
-
- -
-
-enumerator kATTN_V#
-
- -
-
-enumerator kATTN_DENSE#
-
- -
-
-enumerator kMLP_H_TO_4H#
-
- -
-
-enumerator kMLP_4H_TO_H#
-
- -
-
-enumerator kMLP_GATE#
-
- -
-
-enumerator kCROSS_ATTN_QKV#
-
- -
-
-enumerator kCROSS_ATTN_Q#
-
- -
-
-enumerator kCROSS_ATTN_K#
-
- -
-
-enumerator kCROSS_ATTN_V#
-
- -
-
-enumerator kCROSS_ATTN_DENSE#
-
- -
-
-enumerator kMOE_H_TO_4H#
-
- -
-
-enumerator kMOE_4H_TO_H#
-
- -
-
-enumerator kMOE_GATE#
-
- -
-
-enumerator kMOE_ROUTER#
-
- -
-
-enumerator kMLP_ROUTER#
-
- -
-
-enumerator kMLP_GATE_UP#
-
- -
- -
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-

Public Functions

-
-
-inline explicit constexpr LoraModule( - -
-
ModuleType const &t,
-
SizeType32 inDim,
-
SizeType32 outDim,
-
bool inDimFirst,
-
bool outDimFirst,
-
SizeType32 inTpSplitDim,
-
SizeType32 outTpSplitDim,
-
- -) noexcept#
-
- -
-
-inline explicit constexpr LoraModule() noexcept#
-
- -
-
-explicit constexpr LoraModule(LoraModule const &o) = default#
-
- -
-
-constexpr LoraModule &operator=(LoraModule const &o) = default#
-
- -
-
-inline SizeType32 constexpr flattenedInOutSize( - -
-
SizeType32 adapterSize,
-
bool isDora,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr inSize( - -
-
SizeType32 adapterSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr outSize( - -
-
SizeType32 adapterSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localInSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localOutSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localScalesSize( - -
-
SizeType32 tpSize,
-
bool isDora,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localInDim( - -
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localOutDim( - -
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localInAdapterSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localOutAdapterSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localInOutSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr localTotalSize( - -
-
SizeType32 adapterSize,
-
SizeType32 tpSize,
-
bool isDora,
-
- -) const noexcept#
-
- -
-
-inline SizeType32 constexpr value() const noexcept#
-
- -
-
-inline std::string_view constexpr name() const noexcept#
-
- -
-
-inline SizeType32 constexpr inDim() const noexcept#
-
- -
-
-inline SizeType32 constexpr outDim() const noexcept#
-
- -
-
-inline bool constexpr inDimFirst() const noexcept#
-
- -
-
-inline bool constexpr outDimFirst() const noexcept#
-
- -
-
-inline SizeType32 constexpr inTpSplitDim() const noexcept#
-
- -
-
-inline SizeType32 constexpr outTpSplitDim() const noexcept#
-
- -
-
-

Public Static Functions

-
-
-static std::vector<LoraModule> createLoraModules( - -
-
std::vector<std::string> const &loraModuleNames,
-
SizeType32 hiddenSize,
-
SizeType32 mlpHiddenSize,
-
SizeType32 numAttentionHeads,
-
SizeType32 numKvAttentionHeads,
-
SizeType32 attentionHeadSize,
-
SizeType32 tpSize,
-
SizeType32 numExperts,
-
- -)#
-
- -
-
-static inline ModuleType constexpr toModuleType( - -
-
std::string_view const &name,
-
- -)#
-
- -
-
-static inline std::string_view constexpr toModuleName( - -
-
ModuleType t,
-
- -) noexcept#
-
- -
-
-static inline std::string_view constexpr toModuleName(SizeType32 id)#
-
- -
-
-

Private Members

-
-
-ModuleType mType#
-
- -
-
-SizeType32 mInDim#
-
- -
-
-SizeType32 mOutDim#
-
- -
-
-bool mInDimFirst#
-
- -
-
-bool mOutDimFirst#
-
- -
-
-SizeType32 mInTpSplitDim#
-
- -
-
-SizeType32 mOutTpSplitDim#
-
- -
-
- -
- -
- -
-
-

eagleBuffers.h#

-
-
-namespace tensorrt_llm
-
-
-namespace batch_manager
-
- -
-
-namespace runtime
-
-
-class EagleBuffers#
-
-

Public Types

-
-
-using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>#
-
- -
-
-using RequestVector = std::vector<LlmRequestPtr>#
-
- -
-
-using SizeType32 = runtime::SizeType32#
-
- -
-
-using ITensor = runtime::ITensor#
-
- -
-
-using BufferPtr = runtime::IBuffer::SharedPtr#
-
- -
-
-using TensorPtr = runtime::ITensor::SharedPtr#
-
- -
-
-using TensorMap = runtime::StringPtrMap<runtime::ITensor>#
-
- -
-
-

Public Functions

-
-
-EagleBuffers( - -
-
SizeType32 maxBatchSize,
-
SizeType32 maxBeamWidth,
-
runtime::BufferManager const &manager,
-
runtime::ModelConfig const &modelConfig,
-
runtime::WorldConfig const &worldConfig,
-
executor::DecodingConfig const &decodingConfig,
-
- -)#
-
- -
-
-void reshape( - -
-
SizeType32 numCtxSequences,
-
SizeType32 numGenSequences,
-
runtime::ModelConfig const &modelConfig,
-
- -)#
-
- -
-
-void setFromInputs( - -
-
RequestVector const &contextRequests,
-
RequestVector const &genRequests,
-
runtime::ITensor const &requestTypes,
-
ITensor const &seqSlots,
-
EagleBuffers::Inputs const &decoderBuffers,
-
runtime::BufferManager const &manager,
-
runtime::ModelConfig const &modelConfig,
-
runtime::WorldConfig const &worldConfig,
-
- -) const#
-
- -
-
-void insertInputTensors( - -
-
TensorMap &inputBuffers,
-
TensorMap &outputBuffers,
-
runtime::WorldConfig const &worldConfig,
-
- -) const#
-
- -
-
-

Public Members

-
-
-Inputs engineInputs#
-
- -
-
-class tensorrt_llm::runtime::EagleBuffers::EngineOutputs engineOutputs#
-
- -
-
-

Private Functions

-
-
-template<typename T>
void setFromInputs( - -
-
RequestVector const &contextRequests,
-
RequestVector const &genRequests,
-
SizeType32 vocabSizePadded,
-
ITensor const &seqSlots,
-
EagleBuffers::Inputs const &draftBuffers,
-
runtime::EagleModule const &eagleModule,
-
runtime::BufferManager const &manager,
-
- -) const#
-
- -
-
-

Private Members

-
-
-std::size_t scanReduceTempStorageBytes = {0}#
-
- -
-
-float mDefaultPosteriorThreshold = {0.09f}#
-
- -
-
-bool mDoGreedySampling = {true}#
-
- -
-
-BufferPtr scanReduceTempStorage#
-
- -
-
-TensorPtr cumSumGenerationLengths#
-
- -
-
-TensorPtr maxGenerationLength#
-
- -
-
-TensorPtr chunkedContextNextTokensHost#
-
- -
-
-TensorPtr greedySamplingHost#
-
- -
-
-TensorPtr posteriorAlphaHost#
-
- -
-
-TensorPtr posteriorThresholdHost#
-
- -
-
-
-class EngineOutputs#
-
-

Public Members

-
-
-TensorPtr nextDraftTokens#
-

[batchSize, maxDecodingDraftTokens]

-
- -
-
-TensorPtr nextDraftLens#
-

[batchSize]

-
- -
-
-TensorPtr nextDraftPaths#
-

[batchSize, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr acceptedTokens#
-

[batchSize, maxPathLen]

-
- -
-
-TensorPtr acceptedLens#
-

[batchSize]

-
- -
-
-TensorPtr acceptedPaths#
-

[batchSize]

-
- -
-
-TensorPtr chunkedContextNextTokens#
-

[batchSize]

-
- -
-
- -
-
-class Inputs#
-
-

Public Functions

-
-
-void create( - -
-
SizeType32 maxNumSequences,
-
BufferManager const &manager,
-
ModelConfig const &modelConfig,
-
WorldConfig const &worldConfig,
-
- -)#
-
- -
-
-

Public Members

-
-
-TensorPtr temperatures#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr posteriorAlpha#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr posteriorThreshold#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr randomDataSample#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr randomDataValidation#
-

[maxBatchSize, maxDecodingTokens] or [numSequences, maxDecodingTokens]

-
- -
-
-TensorPtr draftTokens#
-

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

-
- -
-
-TensorPtr draftLens#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr draftPaths#
-

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr draftPathsHost#
-

[maxBatchSize, maxNumPaths, maxPathLen] or [numSequences, maxNumPaths, maxPathLen]

-
- -
-
-TensorPtr specDecodingGenerationLengths#
-

[maxBatchSize] or [numGenSequences]

-
- -
-
-TensorPtr specDecodingGenerationLengthsHost#
-

[maxBatchSize] or [numGenSequences]

-
- -
-
-TensorPtr specDecodingPackedMasks#
-

[maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)] or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]

-
- -
-
-TensorPtr specDecodingPositionOffsets#
-

[maxBatchSize] or [numGenSequences]

-
- -
-
-TensorPtr eagleNetCtxRequestTypesHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr eagleNetCtxContextLengthsHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr eagleNetCtxPastKeyValueLengthsHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr eagleNetGenRequestTypesHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr eagleNetGenContextLengthsHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr eagleNetGenPastKeyValueLengthsHost#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr inputGenTokensHost#
-

[maxBatchSize * maxDecodingTokens] or [numSequences * maxDecodingTokens]

-
- -
-
-TensorPtr chunkedContextNextTokens#
-

[maxBatchSize] or [numSequences]

-
- -
-
-TensorPtr useSpecDecoding#
-

[1]

-
- -
-
-TensorPtr useDynamicTreeHost#
-

[1]

-
- -
-
-TensorPtr dynamicTreeMaxTopKHost#
-

[1]

-
- -
-
-TensorPtr prevScores#
-

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

-
- -
-
-TensorPtr currentExpandIndices#
-

[maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]

-
- -
-
-TensorPtr allLayersScores#
-

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

-
- -
-
-TensorPtr allLayersDraftTokenIds#
-

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

-
- -
-
-TensorPtr allLayersDraftTokenIdsPredecessor#
-

[maxBatchSize, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens] or [numSequences, numEagleLayers, maxDecodingDraftTokens * maxDecodingDraftTokens]

-
- -
-
- -
- -
- -
- -
-
-

speculativeDecodingMode.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class SpeculativeDecodingMode#
-
-

Public Types

-
-
-using UnderlyingType = std::uint8_t#
-
- -
-
-

Public Functions

-
-
-inline bool constexpr isNone() const#
-
- -
-
-inline bool constexpr isDraftTokensExternal() const#
-
- -
-
-inline bool constexpr isMedusa() const#
-
- -
-
-inline bool constexpr isLookaheadDecoding() const#
-
- -
-
-inline bool constexpr isExplicitDraftTokens() const#
-
- -
-
-inline bool constexpr isEagle() const#
-
- -
-
-inline bool constexpr updatesPositionIds() const#
-
- -
-
-inline bool constexpr requiresAttentionMask() const#
-
- -
-
-inline bool constexpr predictsDraftTokens() const#
-
- -
-
-inline bool constexpr needsKVCacheRewind() const#
-
- -
-
-inline bool constexpr variableDraftLength() const#
-
- -
-
-inline bool constexpr hasDraftLogits() const#
-
- -
-
-inline bool constexpr needsDecoderPrologue() const#
-
- -
-
-inline bool operator==(SpeculativeDecodingMode const &other) const#
-
- -
-
-inline explicit constexpr SpeculativeDecodingMode( - -
-
UnderlyingType state,
-
- -)#
-
- -
-
-

Public Static Functions

-
-
-static inline auto constexpr None()#
-
- -
-
-static inline auto constexpr DraftTokensExternal()#
-
- -
-
-static inline auto constexpr Medusa()#
-
- -
-
-static inline auto constexpr LookaheadDecoding()#
-
- -
-
-static inline auto constexpr ExplicitDraftTokens()#
-
- -
-
-static inline auto constexpr Eagle()#
-
- -
-
-

Private Functions

-
-
-inline bool constexpr anyBitSet(UnderlyingType bits) const#
-
- -
-
-inline bool constexpr allBitSet(UnderlyingType bits) const#
-
- -
-
-

Private Members

-
-
-UnderlyingType mState = {kNone}#
-
- -
-
-

Private Static Attributes

-
-
-static UnderlyingType constexpr kNone = {1U << 0U}#
-
- -
-
-static UnderlyingType constexpr kDraftTokensExternal = {1U << 1U}#
-
- -
-
-static UnderlyingType constexpr kMedusa = {1U << 2U}#
-
- -
-
-static UnderlyingType constexpr kLookaheadDecoding = {1U << 3U}#
-
- -
-
-static UnderlyingType constexpr kExplicitDraftTokens = {1U << 4U}#
-
- -
-
-static UnderlyingType constexpr kEagle = {1U << 5U}#
-
- -
-
- -
- -
- -
-
-

promptTuningParams.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-template<typename TTensor>
class GenericPromptTuningParams#
-
-

Public Types

-
-
-using TensorPtr = TTensor#
-
- -
-
-using SizeType32 = tensorrt_llm::runtime::SizeType32#
-
- -
-
-

Public Functions

-
-
-inline explicit GenericPromptTuningParams( - -
-
TensorPtr embeddingTable = TensorPtr(),
-
TensorPtr tasks = TensorPtr(),
-
TensorPtr vocabSize = TensorPtr(),
-
- -)#
-
- -
-
-

Public Members

-
-
-TensorPtr embeddingTable#
-
- -
-
-TensorPtr tasks#
-
- -
-
-TensorPtr vocabSize#
-
- -
-
-std::vector<bool> promptTuningEnabled#
-
- -
-
- -
-
-class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>#
-
-

Public Types

-
-
-using TensorPtr = ITensor::SharedPtr#
-
- -
-
-using SizeType32 = GenericPromptTuningParams::SizeType32#
-
- -
-
-

Public Functions

-
-
-inline explicit PromptTuningParams( - -
-
TensorPtr embeddingTable = nullptr,
-
TensorPtr tasks = nullptr,
-
TensorPtr vocabSize = nullptr,
-
- -)#
-
- -
-
-void fillTasksTensor( - -
-
TensorPtr tasksHost,
-
SizeType32 batchSize,
-
SizeType32 numContextRequests,
-
std::vector<SizeType32> const &reqBeamWidths,
-
std::vector<SizeType32> const &reqPromptLengths,
-
BufferManager const &manager,
-
bool packedInput,
-
- -)#
-
- -
-
- -
- -
- -
-
-

gptDecoder.h#

-
-
-namespace tensorrt_llm
-
-
-namespace layers#
-
- -
-
-namespace runtime
-
-

Functions

-
-
-inline runtime::ITensor::SharedConstPtr getDefaultBatchSlots( - -
-
runtime::SizeType32 batchSize,
-
- -)#
-

Helper function to produce batch slots [0, 1, …, batchSize - 1] for paths that do not explicitly provide batch slots to the decoder.

-
- -
-
-
-template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder#
-
-

Public Types

-
-
-using CudaStreamPtr = BufferManager::CudaStreamPtr#
-
- -
-
-using TensorPtr = std::shared_ptr<ITensor>#
-
- -
-
-

Public Functions

-
-
-GptDecoder( - -
-
executor::DecodingMode const &mode,
-
size_t maxNumSequences,
-
size_t maxBeamWidth,
-
size_t vocabSize,
-
size_t vocabSizePadded,
-
CudaStreamPtr const &stream,
-
std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr,
-
- -)#
-
- -
-
-virtual void setup( - -
-
SamplingConfig const &samplingConfig,
-
size_t batchSize,
-
TensorConstPtr const &batchSlots,
-
std::optional<DecodingOutput> const &output = std::nullopt,
-
std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
-
std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
-
std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
-
- -) override#
-
-
Parameters:
-

explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.

-
-
-
- -
-
-virtual void forwardAsync( - -
-
DecodingOutput &output,
-
DecodingInput const &input,
-
- -) override#
-
- -
-
-virtual void forwardSync( - -
-
DecodingOutput &output,
-
DecodingInput const &input,
-
- -) override#
-
- -
-
-inline virtual SamplingConfig const &getSamplingConfig() override#
-
- -
-
-virtual void disableLookahead( - -
-
std::optional<SamplingConfig> const &samplingConfig,
-
SizeType32 batchSize,
-
TensorConstPtr batchSlots,
-
- -) override#
-
- -
-
-

Private Members

-
-
-std::shared_ptr<BufferManager> mManager#
-
- -
-
-std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer#
-
- -
-
-std::shared_ptr<tensorrt_llm::runtime::DecodingLayerWorkspace> mDecodingLayerWorkspace#
-
- -
-
-SamplingConfig mSamplingConfig#
-
- -
-
-size_t mMaxNumSequences#
-
- -
-
-size_t mVocabSize#
-
- -
-
-size_t mVocabSizePadded#
-
- -
-
-executor::DecodingMode mDecodingMode#
-
- -
-
- -
-
-class IGptDecoder#
-

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

-
-

Public Types

-
-
-using TensorPtr = runtime::ITensor::SharedPtr#
-
- -
-
-using TensorConstPtr = runtime::ITensor::SharedConstPtr#
-
- -
-
-

Public Functions

-
-
-virtual ~IGptDecoder() = default#
-
- -
-
-virtual void setup( - -
-
SamplingConfig const &samplingConfig,
-
size_t batchSize,
-
TensorConstPtr const &batchSlots,
-
std::optional<DecodingOutput> const &output = std::nullopt,
-
std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
-
std::optional<std::vector<TensorConstPtr>> const &lookaheadPrompt = std::nullopt,
-
std::optional<std::vector<executor::LookaheadDecodingConfig>> const &lookaheadAlgoConfigs = std::nullopt,
-
- -) = 0#
-
-
Parameters:
-

explicitDraftTokensDType – is only used by ExplicitDraftTokens model to WAR the lack of bf16 decoder.

-
-
-
- -
-
-virtual void forwardAsync( - -
-
DecodingOutput &output,
-
DecodingInput const &input,
-
- -) = 0#
-
- -
-
-virtual void forwardSync( - -
-
DecodingOutput &output,
-
DecodingInput const &input,
-
- -) = 0#
-
- -
-
-virtual SamplingConfig const &getSamplingConfig() = 0#
-
- -
-
-virtual void disableLookahead( - -
-
std::optional<SamplingConfig> const &samplingConfig,
-
SizeType32 batchSize,
-
TensorConstPtr batchSlots,
-
- -) = 0#
-
- -
-
-

Public Static Functions

-
-
-static inline std::unique_ptr<IGptDecoder> create( - -
-
executor::DecodingMode const &mode,
-
nvinfer1::DataType dtype,
-
size_t maxNumSequences,
-
size_t maxBeamWidth,
-
size_t vocabSize,
-
size_t vocabSizePadded,
-
BufferManager::CudaStreamPtr const &stream,
-
std::shared_ptr<SpeculativeDecodingModule const> const &speculativeDecodingModule = nullptr,
-
- -)#
-
- -
-
- -
- -
- -
-
-

memoryCounters.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class MemoryCounters#
-
-

Public Types

-
-
-using SizeType32 = std::size_t#
-
- -
-
-using DiffType = std::ptrdiff_t#
-
- -
-
-

Public Functions

-
-
-MemoryCounters() = default#
-
- -
-
-inline SizeType32 getGpu() const#
-
- -
-
-inline SizeType32 getCpu() const#
-
- -
-
-inline SizeType32 getPinned() const#
-
- -
-
-inline SizeType32 getUVM() const#
-
- -
-
-inline SizeType32 getPinnedPool() const#
-
- -
-
-inline DiffType getGpuDiff() const#
-
- -
-
-inline DiffType getCpuDiff() const#
-
- -
-
-inline DiffType getPinnedDiff() const#
-
- -
-
-inline DiffType getUVMDiff() const#
-
- -
-
-inline DiffType getPinnedPoolDiff() const#
-
- -
-
-template<MemoryType T>
inline void allocate(SizeType32 size)#
-
- -
-
-void allocate(MemoryType memoryType, SizeType32 size)#
-
- -
-
-template<MemoryType T>
inline void deallocate(SizeType32 size)#
-
- -
-
-void deallocate(MemoryType memoryType, SizeType32 size)#
-
- -
-
-std::string toString() const#
-
- -
-
-

Public Static Functions

-
-
-static MemoryCounters &getInstance()#
-
- -
-
-static std::string bytesToString(SizeType32 bytes, int precision = 2)#
-
- -
-
-static std::string bytesToString(DiffType bytes, int precision = 2)#
-
- -
-
-

Private Members

-
-
-std::atomic<SizeType32> mGpu = {}#
-
- -
-
-std::atomic<SizeType32> mCpu = {}#
-
- -
-
-std::atomic<SizeType32> mPinned = {}#
-
- -
-
-std::atomic<SizeType32> mUVM = {}#
-
- -
-
-std::atomic<SizeType32> mPinnedPool = {}#
-
- -
-
-std::atomic<DiffType> mGpuDiff = {}#
-
- -
-
-std::atomic<DiffType> mCpuDiff = {}#
-
- -
-
-std::atomic<DiffType> mPinnedDiff = {}#
-
- -
-
-std::atomic<DiffType> mUVMDiff = {}#
-
- -
-
-std::atomic<DiffType> mPinnedPoolDiff = {}#
-
- -
-
- -
- -
- -
-
-

ipcNvlsMemory.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-

Functions

-
-
-void MPI_group_barrier(std::set<int> ranks)#
-
- -
-
-bool ipcNvlsSupported()#
-
- -
-
-IpcNvlsHandle *ipcNvlsAllocate(size_t size, std::set<int> ranks)#
-
- -
-
-void ipcNvlsFree(IpcNvlsHandle *handle)#
-
- -
-
-
-template<typename T>
class DeviceAllocationNvls#
-
-

Public Functions

-
-
-DeviceAllocationNvls() = default#
-
- -
-
-inline ~DeviceAllocationNvls()#
-
- -
-
-inline void reset(size_t size, std::set<int> ranks)#
-
- -
-
-inline T *getMulticastPointer() const#
-
- -
-
-inline T *getUnicastPointer() const#
-
- -
-
-inline T **getIpcUnicastPointers()#
-
- -
-
-inline size_t getCapacity() const#
-
- -
-
-inline void free()#
-
- -
-
-

Private Members

-
-
-size_t _capacity = 0#
-
- -
-
-IpcNvlsHandle *_handle#
-
- -
-
- -
-
-struct IpcNvlsHandle#
-
-

Public Members

-
-
-size_t size = 0#
-
- -
-
-uintptr_t uc_ptr = 0#
-
- -
-
-uintptr_t mc_ptr = 0#
-
- -
-
-std::vector<uintptr_t> ipc_uc_ptrs#
-
- -
-
-CUdeviceptr uc_va#
-
- -
-
-CUdeviceptr mc_va#
-
- -
-
-std::vector<CUdeviceptr> ipc_uc_vas#
-
- -
-
-CUmemGenericAllocationHandle uc_handle#
-
- -
-
-CUmemGenericAllocationHandle mc_handle#
-
- -
-
-std::vector<CUmemGenericAllocationHandle> ipc_uc_handles#
-
- -
-
- -
- -
- -
-
-

rawEngine.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-
-class RawEngine#
-
-

Public Types

-
-
-enum Type#
-

Values:

-
-
-enumerator FilePath#
-
- -
-
-enumerator AddressWithSize#
-
- -
-
-enumerator HostMemory#
-
- -
- -
-
-

Public Functions

-
-
-inline explicit RawEngine(std::filesystem::path enginePath) noexcept#
-
- -
-
-inline explicit RawEngine( - -
-
void const *engineAddr,
-
std::size_t engineSize,
-
- -) noexcept#
-
- -
-
-inline explicit RawEngine( - -
-
nvinfer1::IHostMemory const *engineBuffer,
-
- -) noexcept#
-
- -
-
-inline Type getType() const#
-
- -
-
-inline std::filesystem::path getPath() const#
-
- -
-
-inline std::optional<std::filesystem::path> getPathOpt() const#
-
- -
-
-inline void setPath(std::filesystem::path enginePath)#
-
- -
-
-inline std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const &getManagedWeightsMapOpt( - -
-
- -) const#
-
- -
-
-inline void setManagedWeightsMap( - -
-
std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap,
-
- -)#
-
- -
-
-inline void const *getAddress() const#
-
- -
-
-inline std::size_t getSize() const#
-
- -
-
-inline nvinfer1::IHostMemory const *getHostMemory() const#
-
- -
-
-

Public Members

-
-
-void const *mEngineAddr = {}#
-
- -
-
-std::size_t mEngineSize = {}#
-
- -
-
-

Private Members

-
-
-Type mType#
-
- -
-
-std::optional<std::filesystem::path> mEnginePath#
-
- -
-
-struct tensorrt_llm::runtime::RawEngine
-
- -
-
-nvinfer1::IHostMemory const *mEngineBuffer = {}#
-
- -
-
-std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap#
-
- -
-
- -
- -
-

ipcUtils.h#

@@ -11123,1220 +12504,8 @@ one more than decoding draft tokens for prediction from primary head

-
-

iBuffer.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-

Typedefs

-
-
-template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type#
-
- -
-
-

Enums

-
-
-enum class MemoryType : std::int32_t#
-

Values:

-
-
-enumerator kGPU#
-
- -
-
-enumerator kCPU#
-
- -
-
-enumerator kPINNED#
-
- -
-
-enumerator kUVM#
-
- -
-
-enumerator kPINNEDPOOL#
-
- -
- -
-
-

Functions

-
-
-template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast( - -
-
std::shared_ptr<T> const &ptr,
-
- -) noexcept#
-
- -
-
-template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast( - -
-
std::unique_ptr<T, D> &&ptr,
-
- -) noexcept#
-
- -
-
-template<typename T>
T const *bufferCast(IBuffer const &buffer)#
-

Gets a typed pointer to the constant underlying data of the buffer.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

buffer – The buffer to get a pointer to.

-
-
Returns:
-

A pointer to constant T.

-
-
-
- -
-
-template<typename T>
T *bufferCast(IBuffer &buffer)#
-

Gets a typed pointer to the underlying data of the buffer.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

buffer – The buffer to get a pointer to.

-
-
Returns:
-

A pointer to T.

-
-
-
- -
-
-template<typename T>
T *bufferCastOrNull( - -
-
IBuffer::SharedPtr const &bufferPtr,
-
- -)#
-

Retrieves a T typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

bufferPtr – A possibly null shared ptr.

-
-
Returns:
-

A pointer to T, possibly nullptr.

-
-
-
- -
-
-template<typename T>
T const *bufferCastOrNull( - -
-
IBuffer::SharedConstPtr const &bufferPtr,
-
- -)#
-

Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the bufferPtr, or nullptr if the bufferPtr is null.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

bufferPtr – A possibly null shared ptr.

-
-
Returns:
-

A pointer to const T, possibly nullptr.

-
-
-
- -
-
-template<typename T>
T *bufferCastOrNull( - -
-
std::optional<IBuffer::SharedPtr> const &optionalBufferPtr,
-
- -)#
-

Retrieves a T typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

optionalBufferPtr – A possibly empty optional.

-
-
Returns:
-

A pointer to T, possibly nullptr.

-
-
-
- -
-
-template<typename T>
T const *bufferCastOrNull( - -
-
std::optional<IBuffer::SharedConstPtr> const &optionalBufferPtr,
-
- -)#
-

Retrieves a T const typed pointer to the underlying data of the buffer pointed to by the buffer pointer contained in the optionalBufferPtr, or nullptr if the optional doesn’t have a value.

-
-
Template Parameters:
-

T – The type of the underlying data.

-
-
Parameters:
-

optionalBufferPtr – A possibly empty optional.

-
-
Returns:
-

A pointer to const T, possibly nullptr.

-
-
-
- -
-
-std::ostream &operator<<( - -
-
std::ostream &output,
-
IBuffer const &buffer,
-
- -)#
-

Utility function to print a buffer.

-
- -
-
-
-class BufferDataType#
-
-#include <iBuffer.h>
-

A wrapper around nvinfer1::DataType that provides a support for pointer types.

-
-

Public Functions

-
-
-inline constexpr BufferDataType( - -
-
nvinfer1::DataType dataType,
-
bool _unsigned = false,
-
bool pointer = false,
-
- -)#
-
- -
-
-inline constexpr operator nvinfer1::DataType() const noexcept#
-
- -
-
-inline constexpr nvinfer1::DataType getDataType() const noexcept#
-
- -
-
-inline constexpr bool isPointer() const noexcept#
-
- -
-
-inline constexpr bool isUnsigned() const#
-
- -
-
-inline constexpr std::size_t getSize() const noexcept#
-
- -
-
-inline constexpr std::size_t getSizeInBits() const noexcept#
-
- -
-
-

Public Static Attributes

-
-
-static auto constexpr kTrtPointerType = nvinfer1::DataType::kINT64#
-
- -
-
-

Private Members

-
-
-nvinfer1::DataType mDataType#
-
- -
-
-bool mUnsigned#
-
- -
-
-bool mPointer#
-
- -
-
- -
-
-template<typename T>
class BufferRange : public tensorrt_llm::common::ArrayView<T>#
-
-

Public Types

-
-
-using Base = tensorrt_llm::common::ArrayView<T>#
-
- -
-
-

Public Functions

-
-
-inline BufferRange(T *data, size_type size)#
-
- -
-
-template<typename U = T, std::enable_if_t<!std::is_const_v<U>, bool> = true>
inline explicit BufferRange( - -
-
IBuffer &buffer,
-
- -)#
-
- -
-
-template<typename U = T, std::enable_if_t<std::is_const_v<U>, bool> = true>
inline explicit BufferRange( - -
-
IBuffer const &buffer,
-
- -)#
-
- -
-
- -
-
-template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits#
-
-#include <iBuffer.h>
-

For converting a TensorRT data type to a C++ data type.

-
- -
-
-template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>#
-
-

Public Types

-
-
-using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "*"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>#
-
-

Public Types

-
-
-using type = bool#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "bool"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>#
-
-

Public Types

-
-
-using type = float#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "float"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>#
-
-

Public Types

-
-
-using type = half#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "half"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>#
-
-

Public Types

-
-
-using type = std::int32_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "int32"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>#
-
-

Public Types

-
-
-using type = std::uint32_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "uint32"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>#
-
-

Public Types

-
-
-using type = std::int64_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "int64"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>#
-
-

Public Types

-
-
-using type = std::uint64_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "uint64"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>#
-
-

Public Types

-
-
-using type = std::int8_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "int8"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>#
-
-

Public Types

-
-
-using type = std::uint8_t#
-
- -
-
-

Public Static Attributes

-
-
-static char constexpr name[] = "uint8"#
-
- -
-
-static auto constexpr size = sizeof(type)#
-
- -
-
- -
-
-class IBuffer#
-

Subclassed by tensorrt_llm::runtime::ITensor

-
-

Public Types

-
-
-using UniquePtr = std::unique_ptr<IBuffer>#
-
- -
-
-using SharedPtr = std::shared_ptr<IBuffer>#
-
- -
-
-using UniqueConstPtr = std::unique_ptr<IBuffer const>#
-
- -
-
-using SharedConstPtr = std::shared_ptr<IBuffer const>#
-
- -
-
-using DataType = nvinfer1::DataType#
-
- -
-
-

Public Functions

-
-
-virtual void *data() = 0#
-

Returns a pointer to underlying array.

-
- -
-
-virtual void const *data() const = 0#
-

Returns a pointer to underlying array.

-
- -
-
-inline virtual void *data(std::size_t index)#
-

Returns a pointer to the underlying array at a given element index.

-
- -
-
-inline virtual void const *data(std::size_t index) const#
-

Returns a pointer to the underlying array at a given element index.

-
- -
-
-virtual std::size_t getSize() const = 0#
-

Returns the size (in number of elements) of the buffer.

-
- -
-
-inline virtual std::size_t getSizeInBytes() const#
-

Returns the size (in bytes) of the buffer.

-
- -
-
-virtual std::size_t getCapacity() const = 0#
-

Returns the capacity of the buffer.

-
- -
-
-virtual DataType getDataType() const = 0#
-

Returns the data type of the buffer.

-
- -
-
-virtual char const *getDataTypeName() const#
-
- -
-
-virtual MemoryType getMemoryType() const = 0#
-

Returns the memory type of the buffer.

-
- -
-
-virtual char const *getMemoryTypeName() const#
-
- -
-
-virtual void resize(std::size_t newSize) = 0#
-

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

-
- -
-
-virtual void release() = 0#
-

Releases the buffer. It will be reset to nullptr.

-
- -
-
-virtual ~IBuffer() = default#
-
- -
-
-IBuffer(IBuffer const&) = delete#
-

Not allowed to copy.

-
- -
-
-IBuffer &operator=(IBuffer const&) = delete#
-

Not allowed to copy.

-
- -
-
-

Public Static Functions

-
-
-static char const *getDataTypeName(DataType dataType)#
-
- -
-
-static UniquePtr slice( - -
-
SharedPtr buffer,
-
std::size_t offset,
-
std::size_t size,
-
- -)#
-

Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.

-
-
Parameters:
-
    -
  • buffer – The buffer to view.

  • -
  • offset – The offset of the view.

  • -
  • size – The size of the view.

  • -
-
-
Returns:
-

A view on the buffer.

-
-
-
- -
-
-template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( - -
-
TConstPtr &&tensor,
-
std::size_t offset,
-
std::size_t size,
-
- -)#
-
- -
-
-static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)#
-
- -
-
-template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice( - -
-
TConstPtr &&tensor,
-
std::size_t offset,
-
- -)#
-
- -
-
-static inline UniquePtr view(SharedPtr tensor)#
-

Returns a view on the underlying tensor which can be independently resized.

-
-
Parameters:
-

tensor – The tensor to view.

-
-
Returns:
-

A view on the tensor.

-
-
-
- -
-
-static inline UniquePtr view(SharedPtr tensor, std::size_t size)#
-

Returns a view on the underlying tensor with a different size.

-
-
Parameters:
-
    -
  • tensor – The tensor to view.

  • -
  • size – The size of the view.

  • -
-
-
Returns:
-

A view on the tensor.

-
-
-
- -
-
-template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view( - -
-
TConstPtr &&tensor,
-
std::size_t size,
-
- -)#
-
- -
-
-static UniquePtr wrap( - -
-
void *data,
-
DataType type,
-
std::size_t size,
-
std::size_t capacity,
-
- -)#
-

Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.

-
-
Parameters:
-
    -
  • data – The data to wrap.

  • -
  • type – The data type of the data.

  • -
  • size – The size of the buffer.

  • -
  • capacity – The capacity of the buffer.

  • -
-
-
Returns:
-

An IBuffer.

-
-
-
- -
-
-static inline UniquePtr wrap( - -
-
void *data,
-
DataType type,
-
std::size_t size,
-
- -)#
-
- -
-
-template<typename T>
static inline UniquePtr wrap( - -
-
T *data,
-
std::size_t size,
-
std::size_t capacity,
-
- -)#
-
- -
-
-template<typename T>
static inline UniquePtr wrap( - -
-
T *data,
-
std::size_t size,
-
- -)#
-
- -
-
-template<typename T>
static inline UniquePtr wrap( - -
-
std::vector<T> &v,
-
- -)#
-
- -
-
-static MemoryType memoryType(void const *data)#
-

Determine the memory type of a pointer.

-
- -
-
-

Protected Functions

-
-
-IBuffer() = default#
-
- -
-
-inline std::size_t toBytes(std::size_t size) const#
-

Returns an array index or size in bytes.

-
- -
-
- -
-
-template<MemoryType T>
struct MemoryTypeString#
-
- -
-
-template<>
struct MemoryTypeString<MemoryType::kCPU>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = "CPU"#
-
- -
-
- -
-
-template<>
struct MemoryTypeString<MemoryType::kGPU>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = "GPU"#
-
- -
-
- -
-
-template<>
struct MemoryTypeString<MemoryType::kPINNED>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = "PINNED"#
-
- -
-
- -
-
-template<>
struct MemoryTypeString<MemoryType::kPINNEDPOOL>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = "PINNEDPOOL"#
-
- -
-
- -
-
-template<>
struct MemoryTypeString<MemoryType::kUVM>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = "UVM"#
-
- -
-
- -
-
-template<typename T, bool = false>
struct TRTDataType#
-
-#include <iBuffer.h>
-

For converting a C++ data type to a TensorRT data type.

-
- -
-
-template<>
struct TRTDataType<bool>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kBOOL#
-
- -
-
- -
-
-template<>
struct TRTDataType<float>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kFLOAT#
-
- -
-
- -
-
-template<>
struct TRTDataType<half>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kHALF#
-
- -
-
- -
-
-template<>
struct TRTDataType<kernels::FinishedState>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = TRTDataType<kernels::FinishedState::UnderlyingType>::value#
-
- -
-
- -
-
-template<>
struct TRTDataType<kernels::KVCacheIndex>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = TRTDataType<kernels::KVCacheIndex::UnderlyingType>::value#
-
- -
-
- -
-
-template<>
struct TRTDataType<runtime::RequestType>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = TRTDataType<std::underlying_type_t<runtime::RequestType>>::value#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::int32_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kINT32#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::int64_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kINT64#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::int8_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kINT8#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::uint32_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::uint64_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}#
-
- -
-
- -
-
-template<>
struct TRTDataType<std::uint8_t>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = nvinfer1::DataType::kUINT8#
-
- -
-
- -
-
-template<typename T>
struct TRTDataType<T*>#
-
-

Public Static Attributes

-
-
-static auto constexpr value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}#
-
- -
-
-

Private Static Attributes

-
-
-static auto constexpr kUnderlyingType = BufferDataType{TRTDataType<std::remove_const_t<T>, false>::value}#
-
- -
-
- -
-
-template<>
struct TRTDataType<void*>#
-
-

Public Static Attributes

-
-
-static constexpr auto value = BufferDataType::kTrtPointerType#
-
- -
-
- -
- -
- -
-
-

gptJsonConfig.h#

+
+

memoryCounters.h#

namespace tensorrt_llm
@@ -12344,398 +12513,172 @@ one more than decoding draft tokens for prediction from primary head

namespace runtime
-
-class GptJsonConfig#
+
+class MemoryCounters#
+

Public Types

+
+
+using SizeType32 = std::size_t#
+
+ +
+
+using DiffType = std::ptrdiff_t#
+
+ +
+

Public Functions

-
-inline GptJsonConfig( - -
-
std::string name,
-
std::string version,
-
std::string precision,
-
SizeType32 tensorParallelism,
-
SizeType32 pipelineParallelism,
-
SizeType32 contextParallelism,
-
SizeType32 gpusPerNode,
-
ModelConfig modelConfig,
-
std::optional<RuntimeDefaults> runtimeDefaults = std::nullopt,
-
- -)#
+
+MemoryCounters() = default#
-
-inline ModelConfig const &getModelConfig() const#
+
+inline SizeType32 getGpu() const#
-
-inline ModelConfig &getModelConfigMutable()#
+
+inline SizeType32 getCpu() const#
-
-inline std::string const &getName() const#
+
+inline SizeType32 getPinned() const#
-
-inline std::string const &getVersion() const#
+
+inline SizeType32 getUVM() const#
-
-inline std::string const &getPrecision() const#
+
+inline SizeType32 getPinnedPool() const#
-
-inline SizeType32 constexpr getTensorParallelism() const#
+
+inline DiffType getGpuDiff() const#
-
-inline SizeType32 constexpr getPipelineParallelism() const#
+
+inline DiffType getCpuDiff() const#
-
-inline SizeType32 constexpr getContextParallelism() const#
+
+inline DiffType getPinnedDiff() const#
-
-inline SizeType32 constexpr getGpusPerNode() const#
+
+inline DiffType getUVMDiff() const#
-
-inline SizeType32 constexpr getWorldSize() const#
+
+inline DiffType getPinnedPoolDiff() const#
-
-inline std::optional<RuntimeDefaults> getRuntimeDefaults() const#
+
+template<MemoryType T>
inline void allocate(SizeType32 size)#
-
-std::string engineFilename( - -
-
WorldConfig const &worldConfig,
-
std::string const &model,
-
- -) const#
+
+void allocate(MemoryType memoryType, SizeType32 size)#
-
-inline std::string engineFilename( +
+template<MemoryType T>
inline void deallocate(SizeType32 size)#
+
-
-
WorldConfig const &worldConfig,
-
+
+
+void deallocate(MemoryType memoryType, SizeType32 size)#
+
-) const#
+
+
+std::string toString() const#

Public Static Functions

-
-static GptJsonConfig parse(std::string const &json)#
+
+static MemoryCounters &getInstance()#
-
-static GptJsonConfig parse(std::istream &json)#
+
+static std::string bytesToString(SizeType32 bytes, int precision = 2)#
-
-static GptJsonConfig parse(std::filesystem::path const &path)#
+
+static std::string bytesToString(DiffType bytes, int precision = 2)#

Private Members

-
-std::string const mName#
+
+std::atomic<SizeType32> mGpu = {}#
-
-std::string const mVersion#
+
+std::atomic<SizeType32> mCpu = {}#
-
-std::string const mPrecision#
+
+std::atomic<SizeType32> mPinned = {}#
-
-SizeType32 const mTensorParallelism#
+
+std::atomic<SizeType32> mUVM = {}#
-
-SizeType32 const mPipelineParallelism#
+
+std::atomic<SizeType32> mPinnedPool = {}#
-
-SizeType32 const mContextParallelism#
+
+std::atomic<DiffType> mGpuDiff = {}#
-
-SizeType32 const mGpusPerNode#
+
+std::atomic<DiffType> mCpuDiff = {}#
-
-ModelConfig mModelConfig#
+
+std::atomic<DiffType> mPinnedDiff = {}#
-
-std::optional<RuntimeDefaults> mRuntimeDefaults#
-
- -
-
- -
- - - -
-
-

loraCachePageManagerConfig.h#

-
-
-namespace tensorrt_llm
-
-
-namespace runtime
-
-

Functions

-
-
-inline std::ostream &operator<<( - -
-
std::ostream &os,
-
LoraCachePageManagerConfig const &c,
-
- -)#
-
- -
-
-inline std::string to_string(LoraCachePageManagerConfig const &c)#
-
- -
-
-
-class LoraCachePageManagerConfig#
-
-#include <loraCachePageManagerConfig.h>
-

Configuration for LoraCachePageManager

-

See LoraCache docs for description of pages, slots, and page blocks.

-
-

Public Functions

-
-
-inline explicit constexpr LoraCachePageManagerConfig( - -
-
runtime::MemoryType memType,
-
nvinfer1::DataType dType,
-
SizeType32 totalNumPages,
-
SizeType32 maxPagesPerBlock,
-
SizeType32 slotsPerPage,
-
SizeType32 pageWidth,
-
SizeType32 numCopyStreams,
-
- -)#
-
- -
-
-inline runtime::MemoryType constexpr getMemoryType() const noexcept#
-
- -
-
-inline void constexpr setMemoryType( - -
-
runtime::MemoryType const &memoryType,
-
- -) noexcept#
-
- -
-
-inline nvinfer1::DataType constexpr getDataType() const noexcept#
-
- -
-
-inline void constexpr setDataType( - -
-
nvinfer1::DataType const &dtype,
-
- -) noexcept#
-
- -
-
-inline SizeType32 constexpr getTotalNumPages() const noexcept#
-
- -
-
-inline void constexpr setTotalNumPage( - -
-
SizeType32 const &totalNumPages,
-
- -) noexcept#
-
- -
-
-inline SizeType32 constexpr getMaxPagesPerBlock() const noexcept#
-
- -
-
-inline void constexpr setMaxPagesPerBlock( - -
-
SizeType32 const &maxPagesPerBlock,
-
- -) noexcept#
-
- -
-
-inline SizeType32 constexpr getSlotsPerPage() const noexcept#
-
- -
-
-inline void constexpr setSlotsPerPage( - -
-
SizeType32 const &slotsPerPage,
-
- -) noexcept#
-
- -
-
-inline SizeType32 constexpr getPageWidth() const noexcept#
-
- -
-
-inline void constexpr setPageWidth( - -
-
SizeType32 const &pageWidth,
-
- -) noexcept#
-
- -
-
-inline bool constexpr getInitToZero() const noexcept#
-
- -
-
-inline void constexpr setInitToZero(bool initToZero) noexcept#
-
- -
-
-inline SizeType32 constexpr getNumCopyStreams() const noexcept#
-
- -
-
-inline void constexpr setNumCopyStreams( - -
-
SizeType32 numCopyStreams,
-
- -) noexcept#
-
- -
-
-

Private Members

-
-
-runtime::MemoryType mMemoryType#
+
+std::atomic<DiffType> mUVMDiff = {}#
-
-nvinfer1::DataType mDataType#
-
- -
-
-SizeType32 mTotalNumPages#
-
- -
-
-SizeType32 mMaxPagesPerBlock#
-
- -
-
-SizeType32 mSlotsPerPage#
-
- -
-
-SizeType32 mPageWidth#
-
- -
-
-SizeType32 mNumCopyStreams = 1#
-
- -
-
-bool mInitToZero#
+
+std::atomic<DiffType> mPinnedPoolDiff = {}#
@@ -12779,72 +12722,42 @@ one more than decoding draft tokens for prediction from primary head

@@ -14691,10 +14621,12 @@ one more than decoding draft tokens for prediction from primary head

+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
@@ -794,6 +807,109 @@ +class MoeLoadBalancerConfig(StrictBaseModel): + """ + Pydantic configuration model for the Mixture of Experts (MoE) load balancer. + + This model holds configuration data (`num_slots`, etc.) as well as + runtime state (`_ep_rank`, `_ep_size`) which must be set via the + `setup()` method before use. + """ + + num_slots: Optional[int] = None + initial_global_assignments: Optional[Dict[int, List[int]]] = Field( + default=None, + repr=False # Exclude this large dict from model representation + ) + layer_updates_per_iter: int = 0 + _ep_rank: Optional[int] = PrivateAttr(default=None) + _ep_size: Optional[int] = PrivateAttr(default=None) + + # --- Methods --- + + def setup(self, ep_rank: int, ep_size: int) -> None: + """ + Initializes the runtime state of the configuration. + This must be called before accessing properties like `num_local_slots`. + """ + self._ep_rank = ep_rank + self._ep_size = ep_size + + # This assertion was in the original and is critical. + if self.num_slots is None: + raise ValueError("`num_slots` cannot be None when calling setup().") + + if self.num_slots % ep_size != 0: + raise ValueError( + f"`num_slots` ({self.num_slots}) must be divisible by `ep_size` ({ep_size})." + ) + + # --- Computed Properties --- + # These properties depend on the runtime state set by setup() + + @property + def ep_rank(self) -> int: + """Public accessor for the private expert parallel rank.""" + if self._ep_rank is None: + raise AttributeError("ep_rank is not set. Call setup() first.") + return self._ep_rank + + @property + def ep_size(self) -> int: + """Public accessor for the private expert parallel size.""" + if self._ep_size is None: + raise AttributeError("ep_size is not set. Call setup() first.") + return self._ep_size + + @property + def num_local_slots(self) -> int: + """Calculates the number of slots local to this rank.""" + if self.num_slots is None or self._ep_size is None: + raise ValueError( + "Cannot calculate `num_local_slots`. 
" + "`num_slots` must be set and setup() must be called.") + return self.num_slots // self._ep_size + + @property + def slot_start(self) -> int: + """Calculates the starting global slot index for this rank.""" + if self._ep_rank is None: + raise ValueError( + "Cannot calculate `slot_start`. Call setup() first.") + return self._ep_rank * self.num_local_slots + + @property + def slot_end(self) -> int: + """Calculates the ending global slot index (exclusive) for this rank.""" + return self.slot_start + self.num_local_slots + + def get_layer_initial_global_assignments( + self, layer_idx: int) -> Optional[List[int]]: + """ + Retrieves the initial global assignments for a specific layer. + """ + if self.initial_global_assignments is None: + return None + + if layer_idx not in self.initial_global_assignments: + raise KeyError( + f"layer_idx {layer_idx} not found in `initial_global_assignments`." + ) + + assignments = self.initial_global_assignments[layer_idx] + + if self.num_slots is None: + raise ValueError( + "`num_slots` is not set, cannot verify assignment length.") + + if len(assignments) != self.num_slots: + raise ValueError( + f"Assignment length ({len(assignments)}) for layer {layer_idx} " + f"does not match `num_slots` ({self.num_slots}).") + + return assignments + +
[docs] class MoeConfig(StrictBaseModel): @@ -871,6 +987,7 @@ moe_tp_size: int = -1 moe_ep_size: int = -1 cp_config: dict = Field(default_factory=dict) + pp_partition: Optional[List[int]] = Field(default=None) enable_attention_dp: bool = False enable_lm_head_tp_in_adp: bool = False @@ -917,6 +1034,7 @@ gpus_per_node=self.gpus_per_node, tp_size=self.tp_size, pp_size=self.pp_size, + pp_partition=self.pp_partition, cp_size=self.cp_size, cp_config=self.cp_config, enable_attention_dp=self.enable_attention_dp, @@ -1003,6 +1121,16 @@ # this value. Otherwise, speculation will always be on. max_concurrency: Optional[int] = None + # Developer interface: dynamically adjust draft length based on active batch size in runtime. + # Maps batch size to draft lengths. For example: + # {1: 4, 4: 2, 8: 0} means: + # - batch_size >= 1: use draft_len=4 + # - batch_size >= 4: use draft_len=2 + # - batch_size >= 8: use draft_len=0 (disable speculation) + # draft_len_schedule is enforced to contain batch_size=1 and its according draft_len equals max_draft_len for consistency + # for example, if max_draft_len=4, the schedule must contain {1: 4} + draft_len_schedule: Optional[dict[int, int]] = None + load_format: Optional[str] = None # PyTorch only. # Rolling average window size (N) for acceptance length across completed requests. @@ -1040,6 +1168,51 @@ # If set, drafting uses greedy sampling, irrespective of sampling parameters. 
_allow_greedy_draft_tokens: bool = PrivateAttr(True) + @field_validator('draft_len_schedule') + @classmethod + def validate_draft_len_schedule_and_sort(cls, v, info): + """Validate and sort draft_len_schedule by batch size thresholds.""" + if v is not None: + # Validate values + for batch_size, draft_len in v.items(): + if batch_size < 1: + raise ValueError( + f"draft_len_schedule: batch size threshold must be >= 1, got {batch_size}" + ) + if draft_len < 0: + raise ValueError( + f"draft_len_schedule: draft length must be >= 0, got {draft_len}" + ) + + # Require batch_size=1 in schedule + if 1 not in v: + raise ValueError( + "draft_len_schedule must include batch_size=1. " + "All systems can have batch_size=1. Add {1: <max_draft_len>} to your schedule." + ) + + # Enforce schedule[1] == max_draft_len for consistency + max_draft_len = info.data.get('max_draft_len') + if max_draft_len is not None and v[1] != max_draft_len: + raise ValueError( + f"draft_len_schedule[1] must equal max_draft_len for consistency. " + f"Got schedule[1]={v[1]}, but max_draft_len={max_draft_len}. " + f"batch_size=1 should use maximum draft length.") + + # Enforce all draft lengths <= max_draft_len + if max_draft_len is not None: + for batch_size, draft_len in v.items(): + if draft_len > max_draft_len: + raise ValueError( + f"draft_len_schedule: all draft lengths must be <= max_draft_len. " + f"Got draft_len={draft_len} for batch_size={batch_size}, " + f"but max_draft_len={max_draft_len}.") + + # Return sorted dict (by batch size thresholds) + # This ensures efficient lookup + return dict(sorted(v.items(), key=lambda x: x[0])) + return v + @classmethod def from_dict(cls, data: dict): # dispatch to the correct decoding config @@ -1445,12 +1618,11 @@ # Now we need a flag when MTPDecodingConfig is updated by PyTorchModelEngine. num_nextn_predict_layers_from_model_config: int = 1 - # TODO: Hard code for DeepSeek R1 # When encounter <think>, start thinking phase. 
# When encounter </think>, end thinking phase. # <think> [thinking phase] </think> [real output] - BEGIN_THINKING_PHASE_TOKEN: int = 128798 - END_THINKING_PHASE_TOKEN: int = 128799 + begin_thinking_phase_token: int = 128798 + end_thinking_phase_token: int = 128799
[docs] @@ -2158,11 +2330,20 @@ "Timeout in milliseconds for KV cache transfer. Requests exceeding this timeout will be cancelled." ) + kv_transfer_sender_future_timeout_ms: Optional[int] = Field( + default=1000, + gt=0, + description= + "Timeout in milliseconds to wait for the sender future to be ready when scheduled batch size is 0. This allows the request to be eventually cancelled by the user or because of kv_transfer_timeout_ms" + ) + def _to_pybind(self): return _CacheTransceiverConfig( backend=_CacheTransceiverBackendType.from_string(self.backend), max_tokens_in_buffer=self.max_tokens_in_buffer, - kv_transfer_timeout_ms=self.kv_transfer_timeout_ms)
+ kv_transfer_timeout_ms=self.kv_transfer_timeout_ms, + kv_transfer_sender_future_timeout_ms=self. + kv_transfer_sender_future_timeout_ms)
@@ -2288,6 +2469,12 @@ description="Enable LM head TP in attention dp.", status="prototype") + pp_partition: Optional[List[int]] = Field( + default=None, + description= + "Pipeline parallel partition, a list of each rank's layer number.", + status="prototype") + cp_config: Optional[dict] = Field(default_factory=dict, description="Context parallel config.", status="prototype") @@ -2544,6 +2731,7 @@ moe_ep_size=self.moe_expert_parallel_size, enable_attention_dp=self.enable_attention_dp, enable_lm_head_tp_in_adp=self.enable_lm_head_tp_in_adp, + pp_partition=self.pp_partition, cp_config=self.cp_config) return self @@ -3323,6 +3511,9 @@ # PrivateVars _quant_config: Optional[QuantConfig] = PrivateAttr(default=None) + _disable_flash_infer_sampling: bool = PrivateAttr(default=True) + """Unless this is set to False, FlashInfer.sampling is not used, even if available.""" + @property def quant_config(self) -> QuantConfig: if self._quant_config is None: @@ -3405,7 +3596,6 @@ [docs] @model_validator(mode="after") def validate_load_balancer(self) -> 'TorchLlmArgs': - from .._torch import MoeLoadBalancerConfig if isinstance(self.moe_config.load_balancer, str): if not os.path.exists(self.moe_config.load_balancer): raise FileNotFoundError( @@ -3607,82 +3797,6 @@ executor_config = super().get_executor_config(_hf_model_dir, tokenizer) executor_config.mm_encoder_only = self.mm_encoder_only return executor_config - - - # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig -
-[docs] - def get_pytorch_backend_config(self) -> "PyTorchConfig": - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - - return PyTorchConfig( - extra_resource_managers=self.extra_resource_managers, - use_cuda_graph=bool(self.cuda_graph_config is not None), - cuda_graph_batch_sizes=self.cuda_graph_config.batch_sizes - if self.cuda_graph_config else - CudaGraphConfig.model_fields['batch_sizes'].default, - cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size - if self.cuda_graph_config else - CudaGraphConfig.model_fields['max_batch_size'].default, - cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding - if self.cuda_graph_config else - CudaGraphConfig.model_fields['enable_padding'].default, - disable_overlap_scheduler=self.disable_overlap_scheduler, - moe_max_num_tokens=self.moe_config.max_num_tokens, - moe_load_balancer=self.moe_config.load_balancer, - attn_backend=self.attn_backend, - moe_backend=self.moe_config.backend, - use_low_precision_moe_combine=self.moe_config. - use_low_precision_moe_combine, - sampler_type=self.sampler_type, - kv_cache_dtype=self.kv_cache_config.dtype, - mamba_ssm_cache_dtype=self.kv_cache_config.mamba_ssm_cache_dtype, - enable_iter_perf_stats=self.enable_iter_perf_stats, - enable_iter_req_stats=self.enable_iter_req_stats, - print_iter_log=self.print_iter_log, - torch_compile_enabled=bool(self.torch_compile_config is not None), - torch_compile_fullgraph=self.torch_compile_config.enable_fullgraph - if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_fullgraph'].default, - torch_compile_inductor_enabled=self.torch_compile_config. - enable_inductor if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_inductor'].default, - torch_compile_piecewise_cuda_graph=self.torch_compile_config. - enable_piecewise_cuda_graph - if self.torch_compile_config is not None else TorchCompileConfig. 
- model_fields['enable_piecewise_cuda_graph'].default, - torch_compile_piecewise_cuda_graph_num_tokens=self. - torch_compile_config.capture_num_tokens - if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['capture_num_tokens'].default, - torch_compile_enable_userbuffers=self.torch_compile_config. - enable_userbuffers if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['enable_userbuffers'].default, - torch_compile_max_num_streams=self.torch_compile_config. - max_num_streams if self.torch_compile_config is not None else - TorchCompileConfig.model_fields['max_num_streams'].default, - enable_autotuner=self.enable_autotuner, - enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker, - load_format=self.load_format, - enable_min_latency=self.enable_min_latency, - moe_disable_finalize_fusion=self.moe_config.disable_finalize_fusion, - stream_interval=self.stream_interval, - force_dynamic_quantization=self.force_dynamic_quantization, - allreduce_strategy=self.allreduce_strategy, - attention_dp_enable_balance=bool( - self.attention_dp_config is not None - and self.attention_dp_config.enable_balance), - attention_dp_time_out_iters=self.attention_dp_config.timeout_iters - if self.attention_dp_config is not None else - AttentionDpConfig.model_fields['timeout_iters'].default, - attention_dp_batching_wait_iters=self.attention_dp_config. - batching_wait_iters if self.attention_dp_config is not None else - AttentionDpConfig.model_fields['batching_wait_iters'].default, - batch_wait_timeout_ms=self.batch_wait_timeout_ms, - batch_wait_timeout_iters=self.batch_wait_timeout_iters, - batch_wait_max_tokens_ratio=self.batch_wait_max_tokens_ratio, - enable_sleep=self.enable_sleep, - )
@@ -3820,10 +3934,12 @@ + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
diff --git a/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html b/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html index 88799d490f..5b8231a4fc 100644 --- a/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html +++ b/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,22 +61,26 @@ + + + - + + @@ -82,6 +88,8 @@ + +
@@ -734,10 +742,12 @@ + +
+ +
diff --git a/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html b/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html index 00fb7ac5d0..f20fa3e2b5 100644 --- a/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html +++ b/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,22 +61,26 @@ + + + - + + @@ -82,6 +88,8 @@ + +
@@ -901,10 +909,12 @@ always defer defer+madvise + +
diff --git a/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html b/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html index 8080641cea..69d652b815 100644 --- a/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html +++ b/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,22 +61,26 @@ + + + - + + @@ -82,6 +88,8 @@ + +
@@ -926,10 +934,12 @@ others according to your needs.

+ +
+ +
+ +
+ +
+ +
@@ -923,10 +931,12 @@ trtllm-serve ${m + +
+ +
diff --git a/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html index 3b8bb3fbd5..9f414e5dac 100644 --- a/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,22 +61,26 @@ + + + - + + @@ -82,6 +88,8 @@ + +
@@ -580,7 +588,7 @@ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc2 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc3 \ /bin/bash @@ -1097,10 +1105,12 @@ chmod +x bench.sh + +
@@ -1057,10 +1065,12 @@ chmod +x bench.sh + +
@@ -975,10 +983,12 @@ chmod +x bench.sh + +
@@ -1003,10 +1011,12 @@ chmod +x bench.sh + +
+ +
+ +

2. Hierarchical Configuration#

Organize complex or hierarchical arguments into dedicated configuration dataclasses with intuitive and consistent naming.

Guidelines

-
    -
  • Use the XxxConfig suffix consistently
    -Examples: ModelConfig, ParallelConfig, MoeConfig

  • -
  • Reflect conceptual hierarchy
    -The dataclass name should represent a coherent functional unit, not an arbitrary grouping

  • -
  • Avoid over-nesting
    -Use only one level of configuration hierarchy whenever possible (e.g., LlmArgs ParallelConfig) to balance readability and modularity

  • +
      +
    • Use the XxxConfig suffix consistently

      +

      Examples: ModelConfig, ParallelConfig, MoeConfig

      +
    • +
    • Reflect conceptual hierarchy

      +

      The dataclass name should represent a coherent functional unit, not an arbitrary grouping

      +
    • +
    • Avoid over-nesting

      +

      Use only one level of configuration hierarchy whenever possible (e.g., LlmArgs ParallelConfig) to balance readability and modularity

      +
@@ -671,7 +684,7 @@ Use only one level of configuration hierarchy whenever possible (e.g., 2. Update the API schema#

Add the field to the appropriate schema file:

    -
  • Non-committed arguments: tests/unittest/api_stability/references/llm_args.yaml

    +
  • Non-committed arguments: tests/unittest/api_stability/references/llm.yaml

    garbage_collection_gen0_threshold:
       type: int
       default: 20000
    @@ -679,7 +692,7 @@ Use only one level of configuration hierarchy whenever possible (e.g., 
  • -
  • Committed arguments: tests/unittest/api_stability/references_committed/llm_args.yaml

    +
  • Committed arguments: tests/unittest/api_stability/references_committed/llm.yaml

    garbage_collection_gen0_threshold:
       type: int
       default: 20000
    @@ -715,16 +728,16 @@ Use only one level of configuration hierarchy whenever possible (e.g., For non-committed APIs, use the @set_api_status decorator:

    @set_api_status("beta")
     def generate_with_streaming(
    -    self, 
    -    prompts: List[str], 
    +    self,
    +    prompts: List[str],
         **kwargs
     ) -> Iterator[GenerationOutput]:
         """Generate text with streaming output.
    -    
    +
         Args:
             prompts: Input prompts for generation
             **kwargs: Additional generation parameters
    -        
    +
         Returns:
             Iterator of generation outputs
         """
    @@ -946,10 +959,12 @@ python -m pytest <
         
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    diff --git a/latest/developer-guide/perf-overview.html b/latest/developer-guide/perf-overview.html index 102a022ef3..a65e07b878 100644 --- a/latest/developer-guide/perf-overview.html +++ b/latest/developer-guide/perf-overview.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,20 +61,24 @@ + + + - + + @@ -80,6 +86,8 @@ + +
    @@ -1352,7 +1360,7 @@ nvidia/Qwen3-235B-A22B-FP8

Preparing a Dataset#

-

In order to prepare a dataset, you can use the provided script. +

In order to prepare a dataset, you can use the provided script. To generate a synthetic dataset, run the following command:

python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file
 
@@ -1432,7 +1440,7 @@ remain in the system longer and therefore require less requests to achieve stead

Running the Benchmark#

To run the benchmark with the generated data set, simply use the trtllm-bench throughput subcommand. The benchmarker will run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide -a model name (HuggingFace reference or path to a local model), a generated dataset, and a file containing any desired extra options to the LLM APIs (details in tensorrt_llm/llmapi/llm_args.py:LlmArgs).

+a model name (HuggingFace reference or path to a local model), a generated dataset, and a file containing any desired extra options to the LLM APIs (details in tensorrt_llm/llmapi/llm_args.py:LlmArgs).

For dense / non-MoE models:

trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
 
@@ -1610,10 +1618,12 @@ using the --kv_cach
+ +
+ +
+ +
@@ -710,10 +718,12 @@ A more efficient C++ implementation is included in the + +
+ +
+ +
+ +
diff --git a/latest/features/sampling.html b/latest/features/sampling.html index d25e8f6f8f..6e611c8c0d 100644 --- a/latest/features/sampling.html +++ b/latest/features/sampling.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,22 +61,26 @@ + + + - + + @@ -82,6 +88,8 @@ + +
@@ -514,7 +522,7 @@

To use the feature:

  1. Enable the enable_trtllm_sampler option in the LLM class

  2. -
  3. Pass a SamplingParams object with the desired options to the generate() function

  4. +
  5. Pass a SamplingParams object with the desired options to the generate() function

The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:

from tensorrt_llm import LLM, SamplingParams
@@ -579,7 +587,7 @@
 

To enable guided decoding, you must:

  1. Set the guided_decoding_backend parameter to 'xgrammar' or 'llguidance' in the LLM class

  2. -
  3. Create a GuidedDecodingParams object with the desired format specification

    +
  4. Create a GuidedDecodingParams object with the desired format specification

    • Note: Depending on the type of format, a different parameter needs to be chosen to construct the object (json, regex, grammar, structural_tag).

    @@ -600,14 +608,14 @@ llm.generate("Generate a JSON response", sampling_params)
-

You can find a more detailed example on guided decoding here.

+

You can find a more detailed example on guided decoding here.

Logits processor#

Logits processors allow you to modify the logits produced by the network before sampling, enabling custom generation behavior and constraints.

To use a custom logits processor:

    -
  1. Create a custom class that inherits from LogitsProcessor and implements the __call__ method

  2. +
  3. Create a custom class that inherits from LogitsProcessor and implements the __call__ method

  4. Pass an instance of this class to the logits_processor parameter of SamplingParams

The following example demonstrates logits processing:

@@ -635,7 +643,7 @@ llm.generate(["Hello, my name is"], sampling_params) -

You can find a more detailed example on logits processors here.

+

You can find a more detailed example on logits processors here.

@@ -708,10 +716,12 @@ + +
@@ -641,10 +658,12 @@ The setup methods depends on your slurm configuration, pls check with your admin + +