From 72a4b6677ee31964363a11599ddc9960887c4179 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Wed, 22 Oct 2025 01:55:44 +0000 Subject: [PATCH] Update latest GitHub pages to v1.2.0rc1 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 14 +- latest/_cpp_gen/runtime.html | 14 +- .../attention.py | 61 +- .../model_engine.py | 754 ++++--- latest/_modules/index.html | 12 +- latest/_modules/tensorrt_llm/builder.html | 19 +- .../tensorrt_llm/disaggregated_params.html | 12 +- .../tensorrt_llm/executor/request.html | 12 +- .../tensorrt_llm/executor/result.html | 191 +- .../_modules/tensorrt_llm/executor/utils.html | 12 +- latest/_modules/tensorrt_llm/functional.html | 12 +- .../tensorrt_llm/layers/activation.html | 12 +- .../tensorrt_llm/layers/attention.html | 12 +- latest/_modules/tensorrt_llm/layers/cast.html | 12 +- latest/_modules/tensorrt_llm/layers/conv.html | 12 +- .../tensorrt_llm/layers/embedding.html | 12 +- .../_modules/tensorrt_llm/layers/linear.html | 12 +- latest/_modules/tensorrt_llm/layers/mlp.html | 12 +- .../tensorrt_llm/layers/normalization.html | 12 +- .../_modules/tensorrt_llm/layers/pooling.html | 12 +- .../tensorrt_llm/llmapi/build_cache.html | 12 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 64 +- .../tensorrt_llm/llmapi/llm_args.html | 213 +- .../tensorrt_llm/llmapi/mm_encoder.html | 12 +- .../tensorrt_llm/llmapi/mpi_session.html | 12 +- .../tensorrt_llm/models/baichuan/model.html | 14 +- .../tensorrt_llm/models/bert/model.html | 12 +- .../tensorrt_llm/models/bloom/model.html | 12 +- .../tensorrt_llm/models/chatglm/config.html | 12 +- .../tensorrt_llm/models/chatglm/model.html | 14 +- .../tensorrt_llm/models/clip/model.html | 12 +- .../tensorrt_llm/models/cogvlm/config.html | 12 +- .../tensorrt_llm/models/cogvlm/model.html | 12 +- .../tensorrt_llm/models/commandr/model.html | 12 +- .../tensorrt_llm/models/dbrx/config.html | 12 +- .../tensorrt_llm/models/dbrx/model.html | 12 +- .../models/deepseek_v1/model.html | 12 +- .../models/deepseek_v2/model.html | 12 +- .../tensorrt_llm/models/dit/model.html | 12 +- .../tensorrt_llm/models/eagle/model.html | 12 +- .../tensorrt_llm/models/enc_dec/model.html | 12 +- .../tensorrt_llm/models/falcon/config.html | 12 +- .../tensorrt_llm/models/falcon/model.html | 14 +- .../tensorrt_llm/models/gemma/config.html | 12 +- .../tensorrt_llm/models/gemma/model.html | 14 +- .../tensorrt_llm/models/gpt/config.html | 12 +- .../tensorrt_llm/models/gpt/model.html | 12 +- .../tensorrt_llm/models/gptj/config.html | 12 +- .../tensorrt_llm/models/gptj/model.html | 16 +- .../tensorrt_llm/models/gptneox/model.html | 12 +- .../tensorrt_llm/models/llama/config.html | 12 +- .../tensorrt_llm/models/llama/model.html | 12 +- .../tensorrt_llm/models/mamba/model.html | 14 +- .../tensorrt_llm/models/medusa/config.html | 12 +- .../tensorrt_llm/models/medusa/model.html | 14 +- .../tensorrt_llm/models/mllama/model.html | 12 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 12 +- .../tensorrt_llm/models/modeling_utils.html | 15 +- .../tensorrt_llm/models/mpt/model.html | 12 +- .../models/multimodal_encoders/config.html | 12 +- .../models/multimodal_encoders/model.html | 12 +- .../tensorrt_llm/models/opt/model.html | 12 +- .../tensorrt_llm/models/phi/model.html | 16 +- .../tensorrt_llm/models/phi3/model.html | 16 +- .../models/recurrentgemma/model.html | 12 +- .../tensorrt_llm/models/redrafter/model.html | 12 +- .../_modules/tensorrt_llm/plugin/plugin.html | 662 +++--- .../tensorrt_llm/quantization/mode.html | 12 +- .../quantization/quantize_by_modelopt.html | 20 +- .../runtime/enc_dec_model_runner.html | 24 +- .../tensorrt_llm/runtime/generation.html | 55 +- .../runtime/kv_cache_manager.html | 12 +- .../tensorrt_llm/runtime/model_runner.html | 20 +- .../runtime/model_runner_cpp.html | 24 +- .../runtime/multimodal_model_runner.html | 34 +- .../tensorrt_llm/runtime/session.html | 12 +- .../tensorrt_llm/sampling_params.html | 89 +- ...t_Parallelism_in_TensorRT-LLM_part3.md.txt | 239 +++ .../run-benchmark-with-trtllm-serve.md.txt | 82 +- .../_sources/deployment-guide/index.rst.txt | 1 + ...rt-recipe-for-deepseek-r1-on-trtllm.md.txt | 66 +- ...-start-recipe-for-gpt-oss-on-trtllm.md.txt | 63 +- ...t-recipe-for-llama3.3-70b-on-trtllm.md.txt | 103 +- ...t-recipe-for-llama4-scout-on-trtllm.md.txt | 59 +- ...art-recipe-for-qwen3-next-on-trtllm.md.txt | 237 +++ .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- .../examples/llm_api_examples.rst.txt | 1 + .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 2 +- .../llm_inference_async_streaming.rst.txt | 2 +- .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_kv_cache_connector.rst.txt | 2 +- .../examples/llm_kv_cache_offloading.rst.txt | 2 +- .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_mgmn_llm_distributed.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 2 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- latest/_sources/examples/llm_runtime.rst.txt | 2 +- latest/_sources/examples/llm_sampling.rst.txt | 2 +- .../examples/llm_sparse_attention.rst.txt | 8 + .../examples/llm_speculative_decoding.rst.txt | 2 +- .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 2 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 2 +- ...enai_completion_client_json_schema.rst.txt | 2 +- .../benchmarking_with_trtllm_bench.md.txt | 32 +- .../advanced/expert_configurations.md.txt | 44 +- .../auto_deploy/advanced/workflow.md.txt | 2 - .../auto_deploy/support_matrix.md.txt | 1 + .../_sources/features/disagg-serving.md.txt | 2 +- .../feature-combination-matrix.md.txt | 4 +- .../_sources/features/ray-orchestrator.md.txt | 42 + latest/_sources/index.rst.txt | 1 + .../build-from-source-linux.md.txt | 5 + latest/_sources/installation/linux.md.txt | 8 + latest/_sources/llm-api/reference.rst.txt | 20 +- .../_sources/models/supported-models.md.txt | 3 +- latest/_sources/quick-start-guide.md.txt | 2 +- .../benchmarking_with_trtllm_bench.md.txt | 32 +- .../advanced/expert_configurations.md.txt | 44 +- .../advanced/serving_with_trtllm_serve.md.txt | 34 +- .../auto_deploy/advanced/workflow.md.txt | 19 +- .../feature_combination_matrix.md.txt | 18 - ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 12 +- latest/blogs/Falcon180B-H200.html | 12 +- latest/blogs/H100vsA100.html | 12 +- latest/blogs/H200launch.html | 12 +- latest/blogs/XQA-kernel.html | 12 +- latest/blogs/quantization-in-TRT-LLM.html | 12 +- .../blog10_ADP_Balance_Strategy.html | 12 +- .../tech_blog/blog11_GPT_OSS_Eagle3.html | 12 +- ...ded_Decoding_and_Speculative_Decoding.html | 12 +- ...ompute_Implementation_in_TensorRT-LLM.html | 18 +- ...ert_Parallelism_in_TensorRT-LLM_part3.html | 909 ++++++++ ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 18 +- ...1_MTP_Implementation_and_Optimization.html | 12 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 12 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 12 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 12 +- .../blog6_Llama4_maverick_eagle_guide.html | 12 +- ...formance_Analysis_And_Auto_Enablement.html | 12 +- ...ert_Parallelism_in_TensorRT-LLM_part2.html | 12 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 12 +- latest/commands/trtllm-bench.html | 12 +- latest/commands/trtllm-build.html | 60 +- latest/commands/trtllm-eval.html | 48 +- latest/commands/trtllm-serve/index.html | 12 +- .../run-benchmark-with-trtllm-serve.html | 119 +- .../commands/trtllm-serve/trtllm-serve.html | 18 +- latest/deployment-guide/index.html | 13 +- ...tart-recipe-for-deepseek-r1-on-trtllm.html | 109 +- ...ck-start-recipe-for-gpt-oss-on-trtllm.html | 115 +- ...art-recipe-for-llama3.3-70b-on-trtllm.html | 117 +- ...art-recipe-for-llama4-scout-on-trtllm.html | 105 +- ...start-recipe-for-qwen3-next-on-trtllm.html | 937 +++++++++ latest/developer-guide/api-change.html | 12 +- latest/developer-guide/ci-overview.html | 12 +- latest/developer-guide/dev-containers.html | 12 +- latest/developer-guide/overview.html | 18 +- latest/developer-guide/perf-analysis.html | 12 +- latest/developer-guide/perf-benchmarking.html | 12 +- latest/developer-guide/perf-overview.html | 16 +- latest/examples/curl_chat_client.html | 14 +- .../curl_chat_client_for_multimodal.html | 14 +- latest/examples/curl_completion_client.html | 14 +- latest/examples/customization.html | 12 +- .../deepseek_r1_reasoning_parser.html | 14 +- latest/examples/dynamo_k8s_example.html | 12 +- latest/examples/genai_perf_client.html | 14 +- .../genai_perf_client_for_multimodal.html | 14 +- latest/examples/index.html | 12 +- latest/examples/kvcacheconfig.html | 12 +- latest/examples/kvcacheretentionconfig.html | 12 +- latest/examples/llm_api_examples.html | 13 +- latest/examples/llm_guided_decoding.html | 14 +- latest/examples/llm_inference.html | 14 +- latest/examples/llm_inference_async.html | 14 +- .../llm_inference_async_streaming.html | 14 +- .../examples/llm_inference_distributed.html | 14 +- latest/examples/llm_kv_cache_connector.html | 14 +- latest/examples/llm_kv_cache_offloading.html | 14 +- latest/examples/llm_logits_processor.html | 14 +- latest/examples/llm_mgmn_llm_distributed.html | 14 +- latest/examples/llm_mgmn_trtllm_bench.html | 14 +- latest/examples/llm_mgmn_trtllm_serve.html | 14 +- latest/examples/llm_multilora.html | 20 +- latest/examples/llm_runtime.html | 14 +- latest/examples/llm_sampling.html | 14 +- latest/examples/llm_sparse_attention.html | 806 ++++++++ latest/examples/llm_speculative_decoding.html | 20 +- latest/examples/openai_chat_client.html | 14 +- .../openai_chat_client_for_multimodal.html | 14 +- latest/examples/openai_completion_client.html | 14 +- .../openai_completion_client_for_lora.html | 14 +- .../openai_completion_client_json_schema.html | 14 +- latest/examples/trtllm_serve_examples.html | 12 +- latest/features/attention.html | 14 +- .../benchmarking_with_trtllm_bench.html | 44 +- .../auto_deploy/advanced/example_run.html | 12 +- .../advanced/expert_configurations.html | 56 +- .../auto_deploy/advanced/logging.html | 12 +- .../auto_deploy/advanced/workflow.html | 14 +- latest/features/auto_deploy/auto-deploy.html | 18 +- .../features/auto_deploy/support_matrix.html | 19 +- latest/features/checkpoint-loading.html | 12 +- latest/features/disagg-serving.html | 16 +- .../features/feature-combination-matrix.html | 18 +- latest/features/kvcache.html | 12 +- latest/features/long-sequence.html | 12 +- latest/features/lora.html | 12 +- latest/features/multi-modality.html | 16 +- latest/features/overlap-scheduler.html | 12 +- .../paged-attention-ifb-scheduler.html | 16 +- latest/features/parallel-strategy.html | 12 +- latest/features/quantization.html | 12 +- latest/features/ray-orchestrator.html | 715 +++++++ latest/features/sampling.html | 22 +- latest/features/speculative-decoding.html | 12 +- latest/genindex.html | 494 ++++- latest/index.html | 33 +- .../installation/build-from-source-linux.html | 18 +- latest/installation/containers.html | 14 +- latest/installation/index.html | 12 +- latest/installation/linux.html | 19 +- .../advanced/disaggregated-service.html | 12 +- latest/legacy/advanced/executor.html | 22 +- .../legacy/advanced/expert-parallelism.html | 12 +- latest/legacy/advanced/gpt-attention.html | 16 +- latest/legacy/advanced/gpt-runtime.html | 12 +- latest/legacy/advanced/graph-rewriting.html | 12 +- .../legacy/advanced/kv-cache-management.html | 12 +- latest/legacy/advanced/kv-cache-reuse.html | 12 +- latest/legacy/advanced/lora.html | 12 +- .../advanced/lowprecision-pcie-allreduce.html | 12 +- .../open-sourced-cutlass-kernels.html | 12 +- .../legacy/advanced/speculative-decoding.html | 12 +- latest/legacy/advanced/weight-streaming.html | 12 +- latest/legacy/architecture/add-model.html | 12 +- latest/legacy/architecture/checkpoint.html | 12 +- latest/legacy/architecture/core-concepts.html | 22 +- .../architecture/model-weights-loader.html | 12 +- latest/legacy/architecture/workflow.html | 12 +- .../build-image-to-dockerhub.html | 12 +- latest/legacy/dev-on-cloud/dev-on-runpod.html | 12 +- latest/legacy/key-features.html | 12 +- latest/legacy/performance/perf-analysis.html | 12 +- .../legacy/performance/perf-benchmarking.html | 12 +- .../benchmarking-default-performance.html | 12 +- .../deciding-model-sharding-strategy.html | 12 +- .../fp8-quantization.html | 12 +- .../performance-tuning-guide/index.html | 12 +- .../introduction.html | 12 +- ...ing-max-batch-size-and-max-num-tokens.html | 12 +- .../useful-build-time-flags.html | 12 +- .../useful-runtime-flags.html | 12 +- .../python-api/tensorrt_llm.functional.html | 12 +- .../python-api/tensorrt_llm.layers.html | 40 +- .../python-api/tensorrt_llm.models.html | 22 +- .../python-api/tensorrt_llm.plugin.html | 998 ++++++++- .../python-api/tensorrt_llm.quantization.html | 12 +- .../python-api/tensorrt_llm.runtime.html | 27 +- latest/legacy/reference/memory.html | 16 +- .../multimodal-feature-support-matrix.html | 12 +- latest/legacy/reference/precision.html | 32 +- latest/legacy/reference/support-matrix.html | 12 +- latest/legacy/reference/troubleshooting.html | 12 +- latest/legacy/tensorrt_quickstart.html | 12 +- latest/legacy/torch.html | 12 +- latest/llm-api/index.html | 12 +- latest/llm-api/reference.html | 1827 ++++++++++++++++- latest/models/adding-new-model.html | 12 +- latest/models/supported-models.html | 42 +- latest/objects.inv | Bin 177079 -> 179353 bytes latest/overview.html | 14 +- latest/py-modindex.html | 12 +- latest/quick-start-guide.html | 14 +- latest/release-notes.html | 12 +- latest/search.html | 12 +- latest/searchindex.js | 2 +- latest/torch/adding_new_model.html | 12 +- latest/torch/arch_overview.html | 12 +- latest/torch/attention.html | 12 +- .../benchmarking_with_trtllm_bench.html | 44 +- .../auto_deploy/advanced/example_run.html | 12 +- .../advanced/expert_configurations.html | 56 +- .../torch/auto_deploy/advanced/logging.html | 12 +- .../advanced/serving_with_trtllm_serve.html | 46 +- .../torch/auto_deploy/advanced/workflow.html | 31 +- latest/torch/auto_deploy/auto-deploy.html | 12 +- latest/torch/auto_deploy/support_matrix.html | 12 +- latest/torch/features/checkpoint_loading.html | 12 +- .../features/feature_combination_matrix.html | 871 -------- latest/torch/features/lora.html | 12 +- latest/torch/features/overlap_scheduler.html | 12 +- latest/torch/features/quantization.html | 12 +- latest/torch/features/sampling.html | 12 +- latest/torch/kv_cache_manager.html | 12 +- latest/torch/scheduler.html | 12 +- 307 files changed, 11467 insertions(+), 3419 deletions(-) create mode 100644 latest/_sources/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md.txt create mode 100644 latest/_sources/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.md.txt create mode 100644 latest/_sources/examples/llm_sparse_attention.rst.txt create mode 100644 latest/_sources/features/ray-orchestrator.md.txt delete mode 100644 latest/_sources/torch/features/feature_combination_matrix.md.txt create mode 100644 latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html create mode 100644 latest/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.html create mode 100644 latest/examples/llm_sparse_attention.html create mode 100644 latest/features/ray-orchestrator.html delete mode 100644 latest/torch/features/feature_combination_matrix.html diff --git a/latest/.buildinfo b/latest/.buildinfo index d02654bf0d..3d8a1f8e70 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 05441684cb2c0903bdac9ebb5abe267d +config: eb18464cd19c763f9cb542fdd6f60977 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html index 87eb7fb4c0..2a119ff57e 100644 --- a/latest/_cpp_gen/executor.html +++ b/latest/_cpp_gen/executor.html @@ -59,7 +59,7 @@ @@ -69,7 +69,7 @@ - + @@ -335,6 +335,7 @@
  • Generate text with guided decoding
  • Control generated text using logits processor
  • Generate text with multiple LoRA adapters
  • +
  • Sparse Attention
  • Speculative Decoding
  • KV Cache Connector
  • KV Cache Offloading
  • @@ -365,6 +366,7 @@
  • Quick Start Recipe for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
  • Quick Start Recipe for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
  • Quick Start Recipe for GPT-OSS on TensorRT-LLM - Blackwell Hardware
  • +
  • Quick Start Recipe for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
  • @@ -407,6 +409,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -407,6 +409,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -402,6 +404,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -407,6 +409,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -407,6 +409,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -411,6 +413,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -411,6 +413,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -411,6 +413,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -411,6 +413,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -411,6 +413,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -407,6 +409,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -405,6 +407,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -405,6 +407,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide

    @@ -636,9 +641,9 @@ diff --git a/latest/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.html b/latest/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.html index ec286e47fa..c36acd4426 100644 --- a/latest/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.html +++ b/latest/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.html @@ -59,7 +59,7 @@ @@ -71,7 +71,7 @@ - + @@ -337,6 +337,7 @@
  • Generate text with guided decoding
  • Control generated text using logits processor
  • Generate text with multiple LoRA adapters
  • +
  • Sparse Attention
  • Speculative Decoding
  • KV Cache Connector
  • KV Cache Offloading
  • @@ -367,6 +368,7 @@
  • Quick Start Recipe for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
  • Quick Start Recipe for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
  • Quick Start Recipe for GPT-OSS on TensorRT-LLM - Blackwell Hardware
  • +
  • Quick Start Recipe for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
  • @@ -409,6 +411,7 @@
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • +
  • Ray Orchestrator (Prototype)
  • Developer Guide