From 81ab29c8b7e5fd1a83f734cf5908d5748bd8129c Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Fri, 21 Nov 2025 07:33:26 +0000 Subject: [PATCH] Update latest GitHub pages to v1.2.0rc3 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 6157 ++--- latest/_cpp_gen/runtime.html | 19741 ++++++++-------- .../attention.py | 290 +- .../model_engine.py | 385 +- latest/_modules/index.html | 27 +- latest/_modules/tensorrt_llm/builder.html | 27 +- .../tensorrt_llm/disaggregated_params.html | 27 +- .../tensorrt_llm/executor/request.html | 27 +- .../tensorrt_llm/executor/result.html | 64 +- .../_modules/tensorrt_llm/executor/utils.html | 27 +- latest/_modules/tensorrt_llm/functional.html | 27 +- .../tensorrt_llm/layers/activation.html | 27 +- .../tensorrt_llm/layers/attention.html | 27 +- latest/_modules/tensorrt_llm/layers/cast.html | 27 +- latest/_modules/tensorrt_llm/layers/conv.html | 27 +- .../tensorrt_llm/layers/embedding.html | 27 +- .../_modules/tensorrt_llm/layers/linear.html | 27 +- latest/_modules/tensorrt_llm/layers/mlp.html | 27 +- .../tensorrt_llm/layers/normalization.html | 27 +- .../_modules/tensorrt_llm/layers/pooling.html | 27 +- .../tensorrt_llm/llmapi/build_cache.html | 27 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 27 +- .../tensorrt_llm/llmapi/llm_args.html | 307 +- .../tensorrt_llm/llmapi/mm_encoder.html | 27 +- .../tensorrt_llm/llmapi/mpi_session.html | 27 +- .../tensorrt_llm/models/baichuan/model.html | 27 +- .../tensorrt_llm/models/bert/model.html | 27 +- .../tensorrt_llm/models/bloom/model.html | 27 +- .../tensorrt_llm/models/chatglm/config.html | 27 +- .../tensorrt_llm/models/chatglm/model.html | 27 +- .../tensorrt_llm/models/clip/model.html | 27 +- .../tensorrt_llm/models/cogvlm/config.html | 27 +- .../tensorrt_llm/models/cogvlm/model.html | 27 +- .../tensorrt_llm/models/commandr/model.html | 27 +- .../tensorrt_llm/models/dbrx/config.html | 27 +- .../tensorrt_llm/models/dbrx/model.html | 27 +- .../models/deepseek_v1/model.html | 27 +- .../models/deepseek_v2/model.html | 27 +- .../tensorrt_llm/models/dit/model.html | 27 +- .../tensorrt_llm/models/eagle/model.html | 27 +- .../tensorrt_llm/models/enc_dec/model.html | 27 +- .../tensorrt_llm/models/falcon/config.html | 27 +- .../tensorrt_llm/models/falcon/model.html | 27 +- .../tensorrt_llm/models/gemma/config.html | 27 +- .../tensorrt_llm/models/gemma/model.html | 27 +- .../tensorrt_llm/models/gpt/config.html | 27 +- .../tensorrt_llm/models/gpt/model.html | 27 +- .../tensorrt_llm/models/gptj/config.html | 27 +- .../tensorrt_llm/models/gptj/model.html | 27 +- .../tensorrt_llm/models/gptneox/model.html | 27 +- .../tensorrt_llm/models/llama/config.html | 27 +- .../tensorrt_llm/models/llama/model.html | 27 +- .../tensorrt_llm/models/mamba/model.html | 27 +- .../tensorrt_llm/models/medusa/config.html | 27 +- .../tensorrt_llm/models/medusa/model.html | 27 +- .../tensorrt_llm/models/mllama/model.html | 27 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 27 +- .../tensorrt_llm/models/modeling_utils.html | 27 +- .../tensorrt_llm/models/mpt/model.html | 27 +- .../models/multimodal_encoders/config.html | 27 +- .../models/multimodal_encoders/model.html | 27 +- .../tensorrt_llm/models/opt/model.html | 27 +- .../tensorrt_llm/models/phi/model.html | 27 +- .../tensorrt_llm/models/phi3/model.html | 27 +- .../models/recurrentgemma/model.html | 27 +- .../tensorrt_llm/models/redrafter/model.html | 27 +- .../_modules/tensorrt_llm/plugin/plugin.html | 27 +- 
.../tensorrt_llm/quantization/mode.html | 27 +- .../quantization/quantize_by_modelopt.html | 27 +- .../runtime/enc_dec_model_runner.html | 27 +- .../tensorrt_llm/runtime/generation.html | 27 +- .../runtime/kv_cache_manager.html | 27 +- .../tensorrt_llm/runtime/model_runner.html | 27 +- .../runtime/model_runner_cpp.html | 27 +- .../runtime/multimodal_model_runner.html | 27 +- .../tensorrt_llm/runtime/session.html | 27 +- .../tensorrt_llm/sampling_params.html | 27 +- latest/_sources/_cpp_gen/executor.rst.txt | 42 +- latest/_sources/_cpp_gen/runtime.rst.txt | 308 +- .../run-benchmark-with-trtllm-serve.md.txt | 2 +- ...ent-guide-for-deepseek-r1-on-trtllm.md.txt | 2 +- ...loyment-guide-for-gpt-oss-on-trtllm.md.txt | 2 +- ...nt-guide-for-llama3.3-70b-on-trtllm.md.txt | 2 +- ...nt-guide-for-llama4-scout-on-trtllm.md.txt | 2 +- .../developer-guide/api-change.md.txt | 40 +- .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 2 +- .../llm_inference_async_streaming.rst.txt | 2 +- .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_kv_cache_connector.rst.txt | 2 +- .../examples/llm_kv_cache_offloading.rst.txt | 2 +- .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_mgmn_llm_distributed.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 2 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- latest/_sources/examples/llm_runtime.rst.txt | 2 +- latest/_sources/examples/llm_sampling.rst.txt | 2 +- .../examples/llm_sparse_attention.rst.txt | 4 +- .../examples/llm_speculative_decoding.rst.txt | 2 +- .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 2 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 2 +- ...enai_completion_client_json_schema.rst.txt | 2 +- .../_sources/features/multi-modality.md.txt | 2 +- latest/_sources/installation/linux.md.txt | 12 + latest/_sources/llm-api/reference.rst.txt | 4 +- latest/_sources/quick-start-guide.md.txt | 2 +- latest/_static/styles/nvidia-sphinx-theme.css | 2 +- .../styles/nvidia-sphinx-theme.css.map | 2 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 27 +- latest/blogs/Falcon180B-H200.html | 27 +- latest/blogs/H100vsA100.html | 27 +- latest/blogs/H200launch.html | 27 +- latest/blogs/XQA-kernel.html | 27 +- latest/blogs/quantization-in-TRT-LLM.html | 27 +- .../blog10_ADP_Balance_Strategy.html | 27 +- .../tech_blog/blog11_GPT_OSS_Eagle3.html | 27 +- ...ded_Decoding_and_Speculative_Decoding.html | 27 +- ...ompute_Implementation_in_TensorRT-LLM.html | 27 +- ...ert_Parallelism_in_TensorRT-LLM_part3.html | 27 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 27 +- ...1_MTP_Implementation_and_Optimization.html | 27 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 27 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 27 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 27 +- .../blog6_Llama4_maverick_eagle_guide.html | 27 +- ...formance_Analysis_And_Auto_Enablement.html | 27 +- ...ert_Parallelism_in_TensorRT-LLM_part2.html | 27 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 27 +- 
latest/commands/trtllm-bench.html | 27 +- latest/commands/trtllm-build.html | 27 +- latest/commands/trtllm-eval.html | 27 +- latest/commands/trtllm-serve/index.html | 27 +- .../run-benchmark-with-trtllm-serve.html | 29 +- .../commands/trtllm-serve/trtllm-serve.html | 29 +- ...yment-guide-for-deepseek-r1-on-trtllm.html | 29 +- ...eployment-guide-for-gpt-oss-on-trtllm.html | 29 +- ...ment-guide-for-llama3.3-70b-on-trtllm.html | 29 +- ...ment-guide-for-llama4-scout-on-trtllm.html | 29 +- ...oyment-guide-for-qwen3-next-on-trtllm.html | 27 +- latest/deployment-guide/index.html | 27 +- latest/developer-guide/api-change.html | 80 +- latest/developer-guide/ci-overview.html | 27 +- latest/developer-guide/dev-containers.html | 27 +- latest/developer-guide/kv-transfer.html | 27 +- latest/developer-guide/overview.html | 27 +- latest/developer-guide/perf-analysis.html | 27 +- latest/developer-guide/perf-benchmarking.html | 27 +- latest/developer-guide/perf-overview.html | 31 +- latest/examples/curl_chat_client.html | 29 +- .../curl_chat_client_for_multimodal.html | 29 +- latest/examples/curl_completion_client.html | 29 +- latest/examples/customization.html | 27 +- .../deepseek_r1_reasoning_parser.html | 29 +- latest/examples/dynamo_k8s_example.html | 27 +- latest/examples/genai_perf_client.html | 29 +- .../genai_perf_client_for_multimodal.html | 29 +- latest/examples/index.html | 27 +- latest/examples/kvcacheconfig.html | 27 +- latest/examples/kvcacheretentionconfig.html | 27 +- latest/examples/llm_api_examples.html | 27 +- latest/examples/llm_guided_decoding.html | 29 +- latest/examples/llm_inference.html | 29 +- latest/examples/llm_inference_async.html | 29 +- .../llm_inference_async_streaming.html | 29 +- .../examples/llm_inference_distributed.html | 29 +- latest/examples/llm_kv_cache_connector.html | 29 +- latest/examples/llm_kv_cache_offloading.html | 29 +- latest/examples/llm_logits_processor.html | 29 +- latest/examples/llm_mgmn_llm_distributed.html | 29 +- latest/examples/llm_mgmn_trtllm_bench.html | 29 +- latest/examples/llm_mgmn_trtllm_serve.html | 29 +- latest/examples/llm_multilora.html | 29 +- latest/examples/llm_runtime.html | 29 +- latest/examples/llm_sampling.html | 29 +- latest/examples/llm_sparse_attention.html | 416 +- latest/examples/llm_speculative_decoding.html | 29 +- latest/examples/openai_chat_client.html | 29 +- .../openai_chat_client_for_multimodal.html | 29 +- latest/examples/openai_completion_client.html | 29 +- .../openai_completion_client_for_lora.html | 29 +- .../openai_completion_client_json_schema.html | 29 +- latest/examples/trtllm_serve_examples.html | 27 +- latest/features/additional-outputs.html | 27 +- latest/features/attention.html | 29 +- .../benchmarking_with_trtllm_bench.html | 27 +- .../auto_deploy/advanced/example_run.html | 27 +- .../advanced/expert_configurations.html | 27 +- .../auto_deploy/advanced/logging.html | 27 +- .../auto_deploy/advanced/workflow.html | 27 +- latest/features/auto_deploy/auto-deploy.html | 27 +- .../features/auto_deploy/support_matrix.html | 27 +- latest/features/checkpoint-loading.html | 27 +- latest/features/disagg-serving.html | 27 +- .../features/feature-combination-matrix.html | 27 +- latest/features/kvcache.html | 27 +- latest/features/long-sequence.html | 27 +- latest/features/lora.html | 27 +- latest/features/multi-modality.html | 33 +- latest/features/overlap-scheduler.html | 27 +- .../paged-attention-ifb-scheduler.html | 31 +- latest/features/parallel-strategy.html | 27 +- latest/features/quantization.html | 27 +- 
latest/features/ray-orchestrator.html | 27 +- latest/features/sampling.html | 37 +- latest/features/speculative-decoding.html | 27 +- ...orch_compile_and_piecewise_cuda_graph.html | 27 +- latest/genindex.html | 147 +- latest/index.html | 27 +- .../installation/build-from-source-linux.html | 29 +- latest/installation/containers.html | 29 +- latest/installation/index.html | 27 +- latest/installation/linux.html | 36 +- .../advanced/disaggregated-service.html | 27 +- latest/legacy/advanced/executor.html | 37 +- .../legacy/advanced/expert-parallelism.html | 27 +- latest/legacy/advanced/gpt-attention.html | 31 +- latest/legacy/advanced/gpt-runtime.html | 27 +- latest/legacy/advanced/graph-rewriting.html | 27 +- .../legacy/advanced/kv-cache-management.html | 27 +- latest/legacy/advanced/kv-cache-reuse.html | 27 +- latest/legacy/advanced/lora.html | 27 +- .../advanced/lowprecision-pcie-allreduce.html | 27 +- .../open-sourced-cutlass-kernels.html | 27 +- .../legacy/advanced/speculative-decoding.html | 27 +- latest/legacy/advanced/weight-streaming.html | 27 +- latest/legacy/architecture/add-model.html | 27 +- latest/legacy/architecture/checkpoint.html | 27 +- latest/legacy/architecture/core-concepts.html | 37 +- .../architecture/model-weights-loader.html | 27 +- latest/legacy/architecture/workflow.html | 27 +- .../build-image-to-dockerhub.html | 27 +- latest/legacy/dev-on-cloud/dev-on-runpod.html | 27 +- latest/legacy/key-features.html | 27 +- latest/legacy/performance/perf-analysis.html | 27 +- .../legacy/performance/perf-benchmarking.html | 27 +- .../benchmarking-default-performance.html | 27 +- .../deciding-model-sharding-strategy.html | 27 +- .../fp8-quantization.html | 27 +- .../performance-tuning-guide/index.html | 27 +- .../introduction.html | 27 +- ...ing-max-batch-size-and-max-num-tokens.html | 27 +- .../useful-build-time-flags.html | 27 +- .../useful-runtime-flags.html | 27 +- .../python-api/tensorrt_llm.functional.html | 27 +- .../python-api/tensorrt_llm.layers.html | 27 +- .../python-api/tensorrt_llm.models.html | 27 +- .../python-api/tensorrt_llm.plugin.html | 27 +- .../python-api/tensorrt_llm.quantization.html | 27 +- .../python-api/tensorrt_llm.runtime.html | 27 +- latest/legacy/reference/memory.html | 31 +- .../multimodal-feature-support-matrix.html | 27 +- latest/legacy/reference/precision.html | 47 +- latest/legacy/reference/support-matrix.html | 27 +- latest/legacy/reference/troubleshooting.html | 27 +- latest/legacy/tensorrt_quickstart.html | 27 +- latest/legacy/torch.html | 27 +- latest/llm-api/index.html | 27 +- latest/llm-api/reference.html | 253 +- latest/models/adding-new-model.html | 27 +- latest/models/supported-models.html | 27 +- latest/objects.inv | Bin 181168 -> 181367 bytes latest/overview.html | 29 +- latest/py-modindex.html | 27 +- latest/quick-start-guide.html | 29 +- latest/release-notes.html | 27 +- latest/search.html | 27 +- latest/searchindex.js | 2 +- latest/torch/adding_new_model.html | 27 +- latest/torch/arch_overview.html | 27 +- latest/torch/attention.html | 27 +- .../benchmarking_with_trtllm_bench.html | 27 +- .../auto_deploy/advanced/example_run.html | 27 +- .../advanced/expert_configurations.html | 27 +- .../torch/auto_deploy/advanced/logging.html | 27 +- .../advanced/serving_with_trtllm_serve.html | 27 +- .../torch/auto_deploy/advanced/workflow.html | 27 +- latest/torch/auto_deploy/auto-deploy.html | 27 +- latest/torch/auto_deploy/support_matrix.html | 27 +- latest/torch/features/checkpoint_loading.html | 27 +- latest/torch/features/lora.html | 27 +- 
latest/torch/features/overlap_scheduler.html | 27 +- latest/torch/features/quantization.html | 27 +- latest/torch/features/sampling.html | 27 +- latest/torch/kv_cache_manager.html | 27 +- latest/torch/scheduler.html | 27 +- 296 files changed, 18530 insertions(+), 16483 deletions(-) diff --git a/latest/.buildinfo b/latest/.buildinfo index 1f24c7367c..e1872b4a6c 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: a9c5c8c57021602368f541d74d22523d +config: 5b10b2153627779ea5be4dbb07d82396 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html index 182be9c38a..824e5f6543 100644 --- a/latest/_cpp_gen/executor.html +++ b/latest/_cpp_gen/executor.html @@ -32,7 +32,7 @@ - + @@ -44,6 +44,8 @@ + + @@ -59,20 +61,24 @@ + + + - + +
@@ -80,6 +86,8 @@
@@ -506,8 +514,8 @@
Public Functions
+Constructs a DisaggExecutorOrchestrator object.
+ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive responses for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive responses for each generation executor.
Enqueue context-only requests to context executors.
+requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue the requests in the same context executor. If false, try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests; the ids returned may differ from the request ids in each executor.
Enqueue generation-only requests to generation executors.
+requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids corresponding to the order of the requests; these must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue the requests in the same generation executor. If false, try to use a different executor for each request.
Await context responses.
+timeout – The maximum time to wait for new responses.
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses from all context executors. If hasContextAwaitThreads is true, this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Await generation responses.
+timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses from all generation executors. If hasGenAwaitThreads is true, this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Indicates if the current process is allowed to enqueueRequests.
+Get context executors.
+Get generation executors.
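Putting these calls together, a typical context-then-generation flow looks roughly like the sketch below. This is an illustration only: the namespace, include, and exact method names and signatures (enqueueContext, enqueueGeneration, awaitContextResponses, awaitGenerationResponses, canEnqueue) are inferred from the descriptions above, so check the executor headers for the real API.

```cpp
// Hedged sketch of a disaggregated request flow; names and signatures are
// inferred from the documentation above, not copied from the headers.
#include <chrono>
#include <optional>
#include <vector>

using tensorrt_llm::executor::Request;
// Assumed namespace for the orchestrator.
using tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator;

void runDisaggregated(DisaggExecutorOrchestrator& orchestrator,
                      std::vector<Request> const& requests)
{
    // Only processes allowed to enqueue requests should proceed.
    if (!orchestrator.canEnqueue())
    {
        return;
    }

    // Context phase: std::nullopt lets the orchestrator pick the context
    // executor with the fewest inflight requests; batch = false spreads the
    // requests across executors.
    auto globalIds = orchestrator.enqueueContext(requests, std::nullopt, /*batch=*/false);

    // Wait for context responses from any context executor.
    auto ctxResponses = orchestrator.awaitContextResponses(
        std::chrono::milliseconds(100), /*contextIdx=*/std::nullopt);

    // Generation phase: reuse the global ids returned by enqueueContext so the
    // orchestrator can match generation requests to their context results.
    orchestrator.enqueueGeneration(requests, globalIds, std::nullopt, /*batch=*/false);

    // Collect the final generation responses.
    auto genResponses = orchestrator.awaitGenerationResponses(
        std::chrono::milliseconds(100), /*genIdx=*/std::nullopt);
}
```

Passing std::nullopt for the executor index leaves load balancing to the orchestrator, which routes each call to the executor with the smallest number of inflight requests.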
+Private Members
+Public Functions
+Public Types
+Public Types
+Public Functions
+Returns a pointer to underlying array.
+Returns a pointer to underlying array.
+Returns the memory type of the buffer.
+Returns the number of elements in the tensor.
+Returns the size of the tensor in bytes.
+Set the entire memory to zero.
+stream – Must be a valid CUDA stream if the memory type is GPU.
+Copy the data and shape from another tensor.
+other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.
Public Static Functions
+Allocate a cpu tensor with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a cpu tensor in pooled pinned memory with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a tensor in managed memory (UVM) with the given shape and data type.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
Allocate a gpu tensor with the given shape and data type on a particular cuda stream.
+shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap a data pointer into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
Wrap any container into a tensor without taking ownership.
+shape – The shape of the tensor.
dataType – The data type of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
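As a rough illustration of the factory functions above, the sketch below allocates tensors in the different memory spaces and zeroes a GPU tensor. The header path, the DataType enumerator, and the argument order are assumptions taken from the parameter lists documented here, so verify them against the executor tensor header before use.

```cpp
// Hedged sketch only: header path, enum names, and argument order follow the
// parameter lists above and may differ from the released API.
#include <tensorrt_llm/executor/tensor.h>

namespace tle = tensorrt_llm::executor;

template <typename StreamT>  // any valid CUDA stream handle accepted by the API
void tensorSketch(StreamT stream)
{
    // Ordinary host tensor: shape {8, 16}, float32 elements.
    auto hostT = tle::Tensor::cpu({8, 16}, tle::DataType::kFP32);

    // Pinned (page-locked) host tensor for faster host/device transfers.
    auto pinnedT = tle::Tensor::pinned({8, 16}, tle::DataType::kFP32);

    // Device tensor allocated on a caller-supplied CUDA stream.
    auto gpuT = tle::Tensor::gpu({8, 16}, stream, tle::DataType::kFP32);

    // setZero needs a valid stream when the memory lives on the GPU.
    gpuT.setZero(stream);

    // Basic introspection: element count and total byte size.
    auto numElements = gpuT.getSize();
    auto numBytes = gpuT.getSizeInBytes();
    (void) hostT; (void) pinnedT; (void) numElements; (void) numBytes;
}
```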
Private Functions
+Typedefs
+Public Static Functions
+Public Functions
-Public Functions
Public Functions
Friends
@@ -10263,1736 +11937,131 @@ -Public Static Functions
+Public Functions
Public Functions
In order to prepare a dataset, you can use the provided script. To generate a synthetic dataset, run the following command:
python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file
To run the benchmark with the generated dataset, use the trtllm-bench throughput subcommand. The benchmarker runs an offline maximum-throughput scenario in which all requests are queued in rapid succession. You only need to provide a model name (a Hugging Face reference or a path to a local model), the generated dataset, and a file containing any desired extra options for the LLM API (details in tensorrt_llm/llmapi/llm_args.py:LlmArgs).
For dense / non-MoE models:
trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
--kv_cach
You can find a more detailed example on logits processors here.