From 0137c0e12a3f010a3932813e1c83a97f2c3972d1 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Wed, 10 Dec 2025 03:07:22 +0000
Subject: [PATCH] Update latest GitHub pages to v1.2.0rc5

---
 latest/.buildinfo | 2 +-
 latest/_cpp_gen/executor.html | 6081 ++---
 latest/_cpp_gen/runtime.html | 19591 ++++++++--------
 .../attention.py | 125 +-
 .../model_engine.py | 181 +-
 latest/_modules/index.html | 13 +-
 latest/_modules/tensorrt_llm/builder.html | 13 +-
 .../tensorrt_llm/disaggregated_params.html | 13 +-
 .../tensorrt_llm/executor/request.html | 13 +-
 .../tensorrt_llm/executor/result.html | 160 +-
 .../_modules/tensorrt_llm/executor/utils.html | 13 +-
 latest/_modules/tensorrt_llm/functional.html | 15 +-
 .../tensorrt_llm/layers/activation.html | 13 +-
 .../tensorrt_llm/layers/attention.html | 13 +-
 latest/_modules/tensorrt_llm/layers/cast.html | 13 +-
 latest/_modules/tensorrt_llm/layers/conv.html | 13 +-
 .../tensorrt_llm/layers/embedding.html | 13 +-
 .../_modules/tensorrt_llm/layers/linear.html | 13 +-
 latest/_modules/tensorrt_llm/layers/mlp.html | 13 +-
 .../tensorrt_llm/layers/normalization.html | 13 +-
 .../_modules/tensorrt_llm/layers/pooling.html | 13 +-
 .../tensorrt_llm/llmapi/build_cache.html | 13 +-
 latest/_modules/tensorrt_llm/llmapi/llm.html | 57 +-
 .../tensorrt_llm/llmapi/llm_args.html | 753 +-
 .../tensorrt_llm/llmapi/mm_encoder.html | 13 +-
 .../tensorrt_llm/llmapi/mpi_session.html | 20 +-
 .../tensorrt_llm/models/baichuan/model.html | 13 +-
 .../tensorrt_llm/models/bert/model.html | 13 +-
 .../tensorrt_llm/models/bloom/model.html | 13 +-
 .../tensorrt_llm/models/chatglm/config.html | 13 +-
 .../tensorrt_llm/models/chatglm/model.html | 13 +-
 .../tensorrt_llm/models/clip/model.html | 13 +-
 .../tensorrt_llm/models/cogvlm/config.html | 13 +-
 .../tensorrt_llm/models/cogvlm/model.html | 13 +-
 .../tensorrt_llm/models/commandr/model.html | 13 +-
 .../tensorrt_llm/models/dbrx/config.html | 13 +-
 .../tensorrt_llm/models/dbrx/model.html | 13 +-
 .../models/deepseek_v1/model.html | 13 +-
 .../models/deepseek_v2/model.html | 13 +-
 .../tensorrt_llm/models/dit/model.html | 13 +-
 .../tensorrt_llm/models/eagle/model.html | 13 +-
 .../tensorrt_llm/models/enc_dec/model.html | 13 +-
 .../tensorrt_llm/models/falcon/config.html | 13 +-
 .../tensorrt_llm/models/falcon/model.html | 13 +-
 .../tensorrt_llm/models/gemma/config.html | 13 +-
 .../tensorrt_llm/models/gemma/model.html | 13 +-
 .../tensorrt_llm/models/gpt/config.html | 13 +-
 .../tensorrt_llm/models/gpt/model.html | 13 +-
 .../tensorrt_llm/models/gptj/config.html | 13 +-
 .../tensorrt_llm/models/gptj/model.html | 13 +-
 .../tensorrt_llm/models/gptneox/model.html | 13 +-
 .../tensorrt_llm/models/llama/config.html | 13 +-
 .../tensorrt_llm/models/llama/model.html | 13 +-
 .../tensorrt_llm/models/mamba/model.html | 13 +-
 .../tensorrt_llm/models/medusa/config.html | 13 +-
 .../tensorrt_llm/models/medusa/model.html | 13 +-
 .../tensorrt_llm/models/mllama/model.html | 13 +-
 .../tensorrt_llm/models/mmdit_sd3/model.html | 13 +-
 .../tensorrt_llm/models/modeling_utils.html | 13 +-
 .../tensorrt_llm/models/mpt/model.html | 13 +-
 .../models/multimodal_encoders/config.html | 13 +-
 .../models/multimodal_encoders/model.html | 13 +-
 .../tensorrt_llm/models/opt/model.html | 13 +-
 .../tensorrt_llm/models/phi/model.html | 13 +-
 .../tensorrt_llm/models/phi3/model.html | 13 +-
 .../models/recurrentgemma/model.html | 13 +-
 .../tensorrt_llm/models/redrafter/model.html | 13 +-
 .../_modules/tensorrt_llm/plugin/plugin.html | 50 +-
 .../tensorrt_llm/quantization/mode.html | 17 +-
 .../quantization/quantize_by_modelopt.html | 13 +-
 .../runtime/enc_dec_model_runner.html | 13 +-
 .../tensorrt_llm/runtime/generation.html | 13 +-
 .../runtime/kv_cache_manager.html | 13 +-
 .../tensorrt_llm/runtime/model_runner.html | 13 +-
 .../runtime/model_runner_cpp.html | 13 +-
 .../runtime/multimodal_model_runner.html | 13 +-
 .../tensorrt_llm/runtime/session.html | 13 +-
 .../tensorrt_llm/sampling_params.html | 32 +-
 latest/_sources/_cpp_gen/executor.rst.txt | 42 +-
 latest/_sources/_cpp_gen/runtime.rst.txt | 328 +-
 latest/_sources/blogs/H100vsA100.md.txt | 2 +-
 latest/_sources/blogs/H200launch.md.txt | 2 +-
 ...saggregated_Serving_in_TensorRT-LLM.md.txt | 2 +-
 .../run-benchmark-with-trtllm-serve.md.txt | 32 +-
 .../trtllm-serve/trtllm-serve.rst.txt | 28 +-
 ...ent-guide-for-deepseek-r1-on-trtllm.md.txt | 34 +-
 ...loyment-guide-for-gpt-oss-on-trtllm.md.txt | 34 +-
 ...uide-for-kimi-k2-thinking-on-trtllm.md.txt | 308 +
 ...nt-guide-for-llama3.3-70b-on-trtllm.md.txt | 32 +-
 ...nt-guide-for-llama4-scout-on-trtllm.md.txt | 32 +-
 ...eployment-guide-for-qwen3-on-trtllm.md.txt | 256 +
 .../_sources/deployment-guide/index.rst.txt | 2 +
 .../examples/curl_chat_client.rst.txt | 2 +-
 .../curl_chat_client_for_multimodal.rst.txt | 2 +-
 .../examples/curl_completion_client.rst.txt | 2 +-
 .../deepseek_r1_reasoning_parser.rst.txt | 4 +-
 .../examples/genai_perf_client.rst.txt | 2 +-
 .../genai_perf_client_for_multimodal.rst.txt | 2 +-
 .../examples/llm_guided_decoding.rst.txt | 2 +-
 .../_sources/examples/llm_inference.rst.txt | 2 +-
 .../examples/llm_inference_async.rst.txt | 2 +-
 .../llm_inference_async_streaming.rst.txt | 2 +-
 .../llm_inference_distributed.rst.txt | 2 +-
 .../examples/llm_kv_cache_connector.rst.txt | 4 +-
 .../examples/llm_kv_cache_offloading.rst.txt | 2 +-
 .../examples/llm_logits_processor.rst.txt | 2 +-
 .../examples/llm_mgmn_llm_distributed.rst.txt | 4 +-
 .../examples/llm_mgmn_trtllm_bench.rst.txt | 4 +-
 .../examples/llm_mgmn_trtllm_serve.rst.txt | 4 +-
 .../_sources/examples/llm_multilora.rst.txt | 2 +-
 latest/_sources/examples/llm_runtime.rst.txt | 4 +-
 latest/_sources/examples/llm_sampling.rst.txt | 4 +-
 .../examples/llm_sparse_attention.rst.txt | 4 +-
 .../examples/llm_speculative_decoding.rst.txt | 2 +-
 .../examples/openai_chat_client.rst.txt | 2 +-
 .../openai_chat_client_for_multimodal.rst.txt | 2 +-
 .../examples/openai_completion_client.rst.txt | 2 +-
 .../openai_completion_client_for_lora.rst.txt | 2 +-
 ...enai_completion_client_json_schema.rst.txt | 2 +-
 .../auto_deploy/support_matrix.md.txt | 1 +
 .../features/checkpoint-loading.md.txt | 7 +-
 .../_sources/features/disagg-serving.md.txt | 2 +-
 .../_sources/features/guided-decoding.md.txt | 583 +
 latest/_sources/features/helix.md.txt | 82 +
 .../features/kv-cache-connector.md.txt | 113 +
 .../features/parallel-strategy.md.txt | 2 +
 latest/_sources/features/sampling.md.txt | 38 +-
 latest/_sources/index.rst.txt | 4 +
 latest/_sources/installation/linux.md.txt | 7 +-
 .../legacy/advanced/kv-cache-reuse.md.txt | 2 +-
 .../multimodal-feature-support-matrix.md.txt | 4 +-
 latest/_sources/llm-api/reference.rst.txt | 4 +-
 .../_sources/models/supported-models.md.txt | 8 +-
 latest/_sources/overview.md.txt | 22 +-
 latest/_sources/quick-start-guide.md.txt | 11 +-
 .../advanced/expert_configurations.md.txt | 79 +
 ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 13 +-
 latest/blogs/Falcon180B-H200.html | 13 +-
 latest/blogs/H100vsA100.html | 15 +-
 latest/blogs/H200launch.html | 15 +-
 latest/blogs/XQA-kernel.html | 13 +-
 latest/blogs/quantization-in-TRT-LLM.html | 13 +-
 .../blog10_ADP_Balance_Strategy.html | 13 +-
 .../tech_blog/blog11_GPT_OSS_Eagle3.html | 13 +-
 ...ded_Decoding_and_Speculative_Decoding.html | 13 +-
 ...ompute_Implementation_in_TensorRT-LLM.html | 13 +-
 ...ert_Parallelism_in_TensorRT-LLM_part3.html | 13 +-
 ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 13 +-
 ...1_MTP_Implementation_and_Optimization.html | 13 +-
 ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 13 +-
 ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 13 +-
 ...Disaggregated_Serving_in_TensorRT-LLM.html | 15 +-
 .../blog6_Llama4_maverick_eagle_guide.html | 13 +-
 ...formance_Analysis_And_Auto_Enablement.html | 13 +-
 ...ert_Parallelism_in_TensorRT-LLM_part2.html | 13 +-
 .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 13 +-
 latest/commands/trtllm-bench.html | 35 +-
 latest/commands/trtllm-build.html | 13 +-
 latest/commands/trtllm-eval.html | 43 +-
 latest/commands/trtllm-serve/index.html | 15 +-
 .../run-benchmark-with-trtllm-serve.html | 52 +-
 .../commands/trtllm-serve/trtllm-serve.html | 231 +-
 ...yment-guide-for-deepseek-r1-on-trtllm.html | 54 +-
 ...eployment-guide-for-gpt-oss-on-trtllm.html | 59 +-
 ...-guide-for-kimi-k2-thinking-on-trtllm.html | 982 +
 ...ment-guide-for-llama3.3-70b-on-trtllm.html | 52 +-
 ...ment-guide-for-llama4-scout-on-trtllm.html | 52 +-
 ...oyment-guide-for-qwen3-next-on-trtllm.html | 25 +-
 .../deployment-guide-for-qwen3-on-trtllm.html | 985 +
 latest/deployment-guide/index.html | 17 +-
 latest/developer-guide/api-change.html | 13 +-
 latest/developer-guide/ci-overview.html | 13 +-
 latest/developer-guide/dev-containers.html | 13 +-
 latest/developer-guide/kv-transfer.html | 13 +-
 latest/developer-guide/overview.html | 19 +-
 latest/developer-guide/perf-analysis.html | 13 +-
 latest/developer-guide/perf-benchmarking.html | 13 +-
 latest/developer-guide/perf-overview.html | 17 +-
 latest/examples/curl_chat_client.html | 15 +-
 .../curl_chat_client_for_multimodal.html | 15 +-
 latest/examples/curl_completion_client.html | 15 +-
 latest/examples/customization.html | 13 +-
 .../deepseek_r1_reasoning_parser.html | 44 +-
 latest/examples/dynamo_k8s_example.html | 13 +-
 latest/examples/genai_perf_client.html | 15 +-
 .../genai_perf_client_for_multimodal.html | 15 +-
 latest/examples/index.html | 13 +-
 latest/examples/kvcacheconfig.html | 13 +-
 latest/examples/kvcacheretentionconfig.html | 13 +-
 latest/examples/llm_api_examples.html | 13 +-
 latest/examples/llm_guided_decoding.html | 15 +-
 latest/examples/llm_inference.html | 15 +-
 latest/examples/llm_inference_async.html | 15 +-
 .../llm_inference_async_streaming.html | 15 +-
 .../examples/llm_inference_distributed.html | 15 +-
 latest/examples/llm_kv_cache_connector.html | 540 +-
 latest/examples/llm_kv_cache_offloading.html | 15 +-
 latest/examples/llm_logits_processor.html | 15 +-
 latest/examples/llm_mgmn_llm_distributed.html | 138 +-
 latest/examples/llm_mgmn_trtllm_bench.html | 235 +-
 latest/examples/llm_mgmn_trtllm_serve.html | 135 +-
 latest/examples/llm_multilora.html | 15 +-
 latest/examples/llm_runtime.html | 249 +-
 latest/examples/llm_sampling.html | 206 +-
 latest/examples/llm_sparse_attention.html | 359 +-
 latest/examples/llm_speculative_decoding.html | 15 +-
 latest/examples/openai_chat_client.html | 15 +-
 .../openai_chat_client_for_multimodal.html | 15 +-
 latest/examples/openai_completion_client.html | 15 +-
 .../openai_completion_client_for_lora.html | 15 +-
 .../openai_completion_client_json_schema.html | 15 +-
 latest/examples/trtllm_serve_examples.html | 13 +-
 latest/features/additional-outputs.html | 19 +-
 latest/features/attention.html | 15 +-
 .../benchmarking_with_trtllm_bench.html | 13 +-
 .../auto_deploy/advanced/example_run.html | 13 +-
 .../advanced/expert_configurations.html | 13 +-
 .../auto_deploy/advanced/logging.html | 13 +-
 .../auto_deploy/advanced/workflow.html | 13 +-
 latest/features/auto_deploy/auto-deploy.html | 13 +-
 .../features/auto_deploy/support_matrix.html | 14 +-
 latest/features/checkpoint-loading.html | 20 +-
 latest/features/disagg-serving.html | 15 +-
 .../features/feature-combination-matrix.html | 13 +-
 latest/features/guided-decoding.html | 1250 +
 latest/features/helix.html | 792 +
 latest/features/kv-cache-connector.html | 835 +
 latest/features/kvcache.html | 13 +-
 latest/features/long-sequence.html | 13 +-
 latest/features/lora.html | 13 +-
 latest/features/multi-modality.html | 17 +-
 latest/features/overlap-scheduler.html | 13 +-
 .../paged-attention-ifb-scheduler.html | 17 +-
 latest/features/parallel-strategy.html | 14 +-
 latest/features/quantization.html | 13 +-
 latest/features/ray-orchestrator.html | 13 +-
 latest/features/sampling.html | 59 +-
 latest/features/speculative-decoding.html | 19 +-
 ...orch_compile_and_piecewise_cuda_graph.html | 19 +-
 latest/genindex.html | 267 +-
 latest/index.html | 33 +-
 .../installation/build-from-source-linux.html | 15 +-
 latest/installation/containers.html | 15 +-
 latest/installation/index.html | 13 +-
 latest/installation/linux.html | 19 +-
 .../advanced/disaggregated-service.html | 13 +-
 latest/legacy/advanced/executor.html | 23 +-
 .../legacy/advanced/expert-parallelism.html | 13 +-
 latest/legacy/advanced/gpt-attention.html | 17 +-
 latest/legacy/advanced/gpt-runtime.html | 13 +-
 latest/legacy/advanced/graph-rewriting.html | 13 +-
 .../legacy/advanced/kv-cache-management.html | 13 +-
 latest/legacy/advanced/kv-cache-reuse.html | 15 +-
 latest/legacy/advanced/lora.html | 13 +-
 .../advanced/lowprecision-pcie-allreduce.html | 13 +-
 .../open-sourced-cutlass-kernels.html | 13 +-
 .../legacy/advanced/speculative-decoding.html | 13 +-
 latest/legacy/advanced/weight-streaming.html | 13 +-
 latest/legacy/architecture/add-model.html | 13 +-
 latest/legacy/architecture/checkpoint.html | 13 +-
 latest/legacy/architecture/core-concepts.html | 23 +-
 .../architecture/model-weights-loader.html | 13 +-
 latest/legacy/architecture/workflow.html | 13 +-
 .../build-image-to-dockerhub.html | 13 +-
 latest/legacy/dev-on-cloud/dev-on-runpod.html | 13 +-
 latest/legacy/key-features.html | 13 +-
 latest/legacy/performance/perf-analysis.html | 13 +-
 .../legacy/performance/perf-benchmarking.html | 13 +-
 .../benchmarking-default-performance.html | 13 +-
 .../deciding-model-sharding-strategy.html | 13 +-
 .../fp8-quantization.html | 13 +-
 .../performance-tuning-guide/index.html | 13 +-
 .../introduction.html | 13 +-
 ...ing-max-batch-size-and-max-num-tokens.html | 13 +-
 .../useful-build-time-flags.html | 13 +-
 .../useful-runtime-flags.html | 13 +-
 .../python-api/tensorrt_llm.functional.html | 14 +-
 .../python-api/tensorrt_llm.layers.html | 13 +-
 .../python-api/tensorrt_llm.models.html | 13 +-
 .../python-api/tensorrt_llm.plugin.html | 13 +-
 .../python-api/tensorrt_llm.quantization.html | 13 +-
 .../python-api/tensorrt_llm.runtime.html | 13 +-
 latest/legacy/reference/memory.html | 17 +-
 .../multimodal-feature-support-matrix.html | 21 +-
 latest/legacy/reference/precision.html | 33 +-
 latest/legacy/reference/support-matrix.html | 13 +-
 latest/legacy/reference/troubleshooting.html | 13 +-
 latest/legacy/tensorrt_quickstart.html | 13 +-
 latest/legacy/torch.html | 13 +-
 latest/llm-api/index.html | 13 +-
 latest/llm-api/reference.html | 208 +-
 latest/models/adding-new-model.html | 13 +-
 latest/models/supported-models.html | 31 +-
 latest/objects.inv | Bin 181378 -> 182848 bytes
 latest/overview.html | 37 +-
 latest/py-modindex.html | 13 +-
 latest/quick-start-guide.html | 21 +-
 latest/release-notes.html | 13 +-
 latest/search.html | 13 +-
 latest/searchindex.js | 2 +-
 latest/torch/adding_new_model.html | 13 +-
 latest/torch/arch_overview.html | 13 +-
 latest/torch/attention.html | 13 +-
 .../benchmarking_with_trtllm_bench.html | 13 +-
 .../auto_deploy/advanced/example_run.html | 13 +-
 .../advanced/expert_configurations.html | 103 +-
 .../torch/auto_deploy/advanced/logging.html | 13 +-
 .../advanced/serving_with_trtllm_serve.html | 13 +-
 .../torch/auto_deploy/advanced/workflow.html | 13 +-
 latest/torch/auto_deploy/auto-deploy.html | 13 +-
 latest/torch/auto_deploy/support_matrix.html | 13 +-
 latest/torch/features/checkpoint_loading.html | 13 +-
 latest/torch/features/lora.html | 13 +-
 latest/torch/features/overlap_scheduler.html | 13 +-
 latest/torch/features/quantization.html | 13 +-
 latest/torch/features/sampling.html | 13 +-
 latest/torch/kv_cache_manager.html | 13 +-
 latest/torch/scheduler.html | 13 +-
 318 files changed, 24429 insertions(+), 15905 deletions(-)
 create mode 100644 latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt
 create mode 100644 latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt
 create mode 100644 latest/_sources/features/guided-decoding.md.txt
 create mode 100644 latest/_sources/features/helix.md.txt
 create mode 100644 latest/_sources/features/kv-cache-connector.md.txt
 create mode 100644 latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html
 create mode 100644 latest/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html
 create mode 100644 latest/features/guided-decoding.html
 create mode 100644 latest/features/helix.html
 create mode 100644 latest/features/kv-cache-connector.html

diff --git a/latest/.buildinfo b/latest/.buildinfo
index f5f2f10e8a..239d52c49f 100644
--- a/latest/.buildinfo
+++ b/latest/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: e877fa21f4c01def0efb8f650d34bf16
+config: e432c3509163ef03323e39d8537d99ca
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html
index 366b7645cb..e8b4a9df9a 100644
--- a/latest/_cpp_gen/executor.html
+++ b/latest/_cpp_gen/executor.html
@@ -61,7 +61,7 @@
@@ -74,7 +74,7 @@
@@ -374,7 +374,9 @@
   • Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
   • Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
   • Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware
+  • Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware
   • Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
+  • Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell
@@ -415,11 +417,14 @@
   • Quantization
   • Sampling
   • Additional Outputs
+  • Guided Decoding
   • Speculative Decoding
   • Checkpoint Loading
   • AutoDeploy (Prototype)
   • Ray Orchestrator (Prototype)
   • Torch Compile & Piecewise CUDA Graph
+  • Helix Parallelism
+  • KV Cache Connector
   • Developer Guide

    For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

-\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+\[
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
+\]
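
For instance (illustrative numbers): a request with an E2E latency of 2.0 s, a TTFT of 0.5 s, and 31 output tokens has TPOT = (2.0 − 0.5) / (31 − 1) = 50 ms.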

    Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):

-\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+\[
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+\]
+\[
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
+\]
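
As a worked example of the difference (numbers are illustrative): take two requests, one with 10 output tokens and TPOT = 20 ms, one with 100 output tokens and TPOT = 50 ms. Avg TPOT = (20 + 50) / 2 = 35 ms, while Avg ITL = (9 × 20 + 99 × 50) / (9 + 99) = 5130 / 108 ≈ 47.5 ms; the token-weighted average is pulled toward the longer request.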

    End-to-End (E2E) Latency#

    @@ -699,18 +709,20 @@ INFO: Uvicorn running
  • The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.

-\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]

    Tokens Per Second (TPS) or Output Token Throughput#

-\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]
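
For example (illustrative numbers): if a run processes 100 requests with 1,000 input and 200 output tokens each, and the last response finishes 60 s after the first request is sent, then Total TPS = (100,000 + 20,000) / 60 = 2,000, while output TPS = 20,000 / 60 ≈ 333.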
@@ -1006,9 +1018,9 @@ trtllm-serve ${m
diff --git a/latest/commands/trtllm-serve/trtllm-serve.html b/latest/commands/trtllm-serve/trtllm-serve.html
index 9f00d34205..2e49705978 100644
--- a/latest/commands/trtllm-serve/trtllm-serve.html
+++ b/latest/commands/trtllm-serve/trtllm-serve.html
@@ -61,7 +61,7 @@
@@ -76,7 +76,7 @@
@@ -376,7 +376,9 @@

    End-to-End (E2E) Latency#

@@ -981,18 +991,20 @@ chmod +x bench.sh
-\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]

    Tokens Per Second (TPS) or Output Token Throughput#

-\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]
@@ -1180,9 +1192,9 @@ chmod +x bench.sh
diff --git a/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html
index eed951928e..a3dcb09b28 100644
--- a/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html
+++ b/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html
@@ -57,11 +57,13 @@
@@ -70,13 +72,13 @@
@@ -376,7 +378,9 @@

    For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

-\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+\[
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
+\]

    Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):

-\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+\[
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+\]
+\[
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
+\]

    End-to-End (E2E) Latency#

@@ -944,18 +953,20 @@ chmod +x bench.sh
-\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]

    Tokens Per Second (TPS) or Output Token Throughput#

-\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+\[
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
+\]
    @@ -981,11 +992,11 @@ chmod +x bench.sh

next
-Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
+Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware

@@ -1140,9 +1151,9 @@ chmod +x bench.sh
diff --git a/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html
new file mode 100644
index 0000000000..4fb0ef600e
--- /dev/null
+++ b/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html
@@ -0,0 +1,982 @@
+Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell — TensorRT LLM

    Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell#


    Introduction#


    This is a quickstart guide for running the Kimi K2 Thinking model on TensorRT LLM. It focuses on a working setup with recommended defaults.


    Prerequisites#

• GPU: NVIDIA Blackwell Architecture
• OS: Linux
• Drivers: CUDA Driver 575 or Later
• Docker with NVIDIA Container Toolkit installed
• Python3 and python3-pip (Optional, for accuracy evaluation only)

    Models#

• nvidia/Kimi-K2-Thinking-NVFP4

    Deploy Kimi K2 Thinking on DGX B200 through Docker#


    Prepare Docker image#


    Build and run the docker container. See the Docker guide for details.

cd TensorRT-LLM
make -C docker release_build IMAGE_TAG=kimi-k2-thinking-local
make -C docker release_run IMAGE_NAME=tensorrt_llm IMAGE_TAG=kimi-k2-thinking-local LOCAL_USER=1

    Launch the TensorRT LLM Server#


    Prepare an EXTRA_OPTIONS_YAML_FILE that specifies LLM API arguments when deploying the model. An example YAML file is as follows:

max_batch_size: 128
max_num_tokens: 8448
max_seq_len: 8212
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
kv_cache_config:
  free_gpu_memory_fraction: 0.75
  dtype: fp8
cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: 8448
trust_remote_code: true

This YAML file specifies configurations that deploy the model with 8-way expert parallelism for the MoE part and 8-way attention data parallelism. It also enables trust_remote_code so that the customized Kimi K2 Thinking tokenizer works.
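Because these keys are LLM API arguments, the same configuration can in principle be built directly in Python rather than through trtllm-serve. The following is an untested sketch that assumes the LLM constructor and KvCacheConfig accept the YAML keys above as keyword arguments:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Sketch only: mirrors the YAML above; argument names are assumed to match.
llm = LLM(
    model="nvidia/Kimi-K2-Thinking-NVFP4",
    max_batch_size=128,
    max_num_tokens=8448,
    max_seq_len=8212,
    tensor_parallel_size=8,
    moe_expert_parallel_size=8,
    enable_attention_dp=True,
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.75, dtype="fp8"),
    trust_remote_code=True,
)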


    With the EXTRA_OPTIONS_YAML_FILE, use the following example command to launch the TensorRT LLM server with the Kimi-K2-Thinking-NVFP4 model from within the container.

trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \
    --host 0.0.0.0 --port 8000 \
    --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE}

    TensorRT LLM will load weights and select the best kernels during startup. The server is successfully launched when the following log is shown:

INFO:     Started server process [xxxxx]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)

    You can query the health/readiness of the server using:

curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"

    When the Status: 200 code is returned, the server is ready for queries.
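For scripted startup checks, a small polling loop works as well. A minimal sketch using only the Python standard library (assumes the server exposes /health as above):

import time
import urllib.request

# Poll the health endpoint until the server reports ready (HTTP 200).
URL = "http://localhost:8000/health"
while True:
    try:
        if urllib.request.urlopen(URL, timeout=5).status == 200:
            print("Server is ready")
            break
    except OSError:
        pass  # server not up yet; keep waiting
    time.sleep(10)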


    Deploy Kimi K2 Thinking on GB200 NVL72 through SLURM with wide EP and disaggregated serving#


    TensorRT LLM provides a set of SLURM scripts that can be easily configured through YAML files and automatically launch SLURM jobs on GB200 NVL72 clusters for deployment, benchmarking, and accuracy testing purposes. The scripts are located at examples/disaggregated/slurm/benchmark. Refer to this page for more details and example wide EP config files.


    For Kimi K2 Thinking, an example configuration for SLURM arguments and the scripts is as follows:

# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  extra_args: ""  # Cluster-specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
  numa_bind: true  # Only enable for GB200 NVL72

# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only
  use_nv_sa_benchmark: false  # Whether to use NVIDIA SA benchmark script
  multi_round: 8  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Benchmark ratio
  streaming: true  # Enable streaming mode
  concurrency_list: "16"
  input_length: 1024  # Input sequence length
  output_length: 1024  # Output sequence length
  dataset_file: "<dataset_file>"

# Hardware Configuration
hardware:
  gpus_per_node: 4  # Modify this with your hardware configuration
  num_ctx_servers: 4  # Number of context servers
  num_gen_servers: 1  # Number of generation servers

# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Don't build the wheel when launching multiple jobs
  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
  work_dir: "<full_path_to_work_dir>"
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Worker Configuration
worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 128
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
      - 768
      - 1024
      - 2048
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 416
        layer_updates_per_iter: 1
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    stream_interval: 20
    num_postprocess_workers: 4
    trust_remote_code: true
  ctx:
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    trust_remote_code: true

    It includes SLURM-specific configurations, benchmark and hardware details, and environment settings. The worker_config field includes detailed settings for context and generation servers when deploying a disaggregated server, with each specified as a list of LLM API arguments.


    To launch SLURM jobs with the YAML config file, execute the following command:

cd <TensorRT LLM root>/examples/disaggregated/slurm/benchmark
python3 submit.py -c config.yaml

    Query the OpenAI-compatible API Endpoint#


    After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.

curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "nvidia/Kimi-K2-Thinking-NVFP4",
    "messages": [
        {
            "role": "user",
            "content": "Where is New York?"
        }
    ],
    "max_tokens": 128,
    "top_p": 1.0
}' -w "\n"

    Example response:

{
  "id": "chatcmpl-5907ed752eb44d11a12893b19f79f8ca",
  "object": "chat.completion",
  "created": 1764866686,
  "model": "nvidia/Kimi-K2-Thinking-NVFP4",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "<think> The user is asking a very simple question: \"Where is New York?\" This could be interpreted in a few ways:\n\n1. Where is New York State located?\n2. Where is New York City located?\n3. Where is New York located in relation to something else?\n\nGiven the ambiguity, I should provide a comprehensive answer that covers the main interpretations. I should be clear and direct.\n\nLet me structure my answer:\n- First, clarify that \"New York\" can refer to either New York State or New York City\n- For New York State: It's located in the northeastern United States, bordered by New Jersey, Pennsylvania, Connecticut",
        "reasoning_content": "",
        "reasoning": null,
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "length",
      "stop_reason": null,
      "mm_embedding_handle": null,
      "disaggregated_params": null,
      "avg_decoded_tokens_per_iter": 1.0
    }
  ],
  "usage": {
    "prompt_tokens": 12,
    "total_tokens": 140,
    "completion_tokens": 128,
    "prompt_tokens_details": {
      "cached_tokens": 0
    }
  },
  "prompt_token_ids": null
}
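Equivalently, any OpenAI-compatible client can talk to this endpoint. A minimal sketch using the openai Python package (the api_key value is a placeholder; the server does not validate it):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
response = client.chat.completions.create(
    model="nvidia/Kimi-K2-Thinking-NVFP4",
    messages=[{"role": "user", "content": "Where is New York?"}],
    max_tokens=128,
    top_p=1.0,
)
print(response.choices[0].message.content)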

    Benchmark#


    To benchmark the performance of your TensorRT LLM server, you can leverage the built-in benchmark_serving.py script. To do this, first create a wrapper bench.sh script.

cat <<'EOF' > bench.sh
#!/usr/bin/env bash
set -euo pipefail

concurrency_list="1 2 4 8 16 32 64 128 256"
multi_round=5
isl=1024
osl=1024
result_dir=/tmp/kimi_k2_thinking_output

for concurrency in ${concurrency_list}; do
    num_prompts=$((concurrency * multi_round))
    python -m tensorrt_llm.serve.scripts.benchmark_serving \
        --model nvidia/Kimi-K2-Thinking-NVFP4 \
        --backend openai \
        --dataset-name "random" \
        --random-input-len ${isl} \
        --random-output-len ${osl} \
        --random-prefix-len 0 \
        --random-ids \
        --num-prompts ${num_prompts} \
        --max-concurrency ${concurrency} \
        --ignore-eos \
        --tokenize-on-client \
        --percentile-metrics "ttft,tpot,itl,e2el"
done
EOF
chmod +x bench.sh

    If you want to save the results to a file, add the following options:

--save-result \
--result-dir "${result_dir}" \
--result-filename "concurrency_${concurrency}.json"

    For more benchmarking options, see benchmark_serving.py.


    Run bench.sh to begin a serving benchmark.

./bench.sh
diff --git a/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html
index 5472d7d39f..82efa16aeb
diff --git a/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html
index 556dd06599..6f92d23017
diff --git a/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html
index fdcf4af7bd..fb316de6ec
diff --git a/latest/developer-guide/api-change.html b/latest/developer-guide/api-change.html
index 832fb21374..000ef90c1c
diff --git a/latest/features/checkpoint-loading.html b/latest/features/checkpoint-loading.html
index d6a0358b1f..ca9b674b58

General index updates: new command line option entries for --chat_template, --config, --context_parallel_size, --cp_size, --free_gpu_memory_fraction, --moe_cluster_parallel_size, --moe_expert_parallel_size, --pipeline_parallel_size, --revision, and --tensor_parallel_size; a new trtllm-eval-longbench_v2 command line option entry; and updated trtllm-serve-serve and beam_width_array (tensorrt_llm.llmapi.SamplingParams attribute) entries.
diff --git a/latest/installation/build-from-source-linux.html b/latest/installation/build-from-source-linux.html
index ae3764120d..3ca344064a

To use the C++ benchmark scripts under benchmark/cpp, for example gptManagerBenchmark.cpp, add the --benchmarks option:

python3 ./scripts/build_wheel.py --benchmarks
diff --git a/latest/installation/containers.html b/latest/installation/containers.html
index ec42520286..045a5244a6
  • Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware
  • +
  • Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
  • +
  • Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell
  • @@ -413,11 +415,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -413,11 +415,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -413,11 +415,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -417,11 +419,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -415,11 +417,14 @@
  • Quantization
  • Sampling
  • Additional Outputs
  • +
  • Guided Decoding
  • Speculative Decoding
  • Checkpoint Loading
  • AutoDeploy (Prototype)
  • Ray Orchestrator (Prototype)
  • Torch Compile & Piecewise CUDA Graph
  • +
  • Helix Parallelism
  • +
  • KV Cache Connector
  • Developer Guide

    @@ -5352,7 +5360,7 @@ a subset of the possible backends.

    -validator validate_positive_values  »  max_window_size, max_verification_set_size, max_ngram_size[source]#
    +validator validate_positive_values  »  max_ngram_size, max_verification_set_size, max_window_size[source]#
    @@ -5360,6 +5368,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'Lookahead'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
    @@ -6079,6 +6092,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'Medusa'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
    @@ -7613,6 +7631,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'MTP'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
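Each speculative-decoding config in this release gains an is_linear_tree property (visible in the Lookahead, Medusa, and MTP hunks above). A minimal sketch of inspecting it, assuming the MTPDecodingConfig import path and its current num_nextn_predict_layers argument stay as in the LLM-API reference:

# Minimal sketch; import path and constructor argument are assumptions
# based on the current LLM-API reference, not guaranteed stable.
from tensorrt_llm.llmapi import MTPDecodingConfig

spec_config = MTPDecodingConfig(num_nextn_predict_layers=1)
print(spec_config.decoding_type)   # 'MTP'
# MTP drafts a single chain of tokens rather than a branching tree,
# so is_linear_tree is expected to report True here.
print(spec_config.is_linear_tree)
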
    @@ -10326,6 +10349,11 @@ left untouched. Characters mapped to None are deleted.

    NVFP4 = 'NVFP4'#
    +
    +
    +NVFP4_AWQ = 'NVFP4_AWQ'#
    +
    +
    W4A16 = 'W4A16'#
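The quantization enum gains NVFP4_AWQ alongside NVFP4. A hedged sketch of referencing it (QuantAlgo lives in tensorrt_llm.quantization.mode per the pages touched by this patch):

# Sketch only: selecting the new NVFP4_AWQ algorithm by name.
from tensorrt_llm.quantization.mode import QuantAlgo

algo = QuantAlgo.NVFP4_AWQ
print(algo.value)  # 'NVFP4_AWQ'
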
    @@ -15133,6 +15161,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'NGram'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
    @@ -15852,6 +15885,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'User_Provided'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
    @@ -17229,6 +17267,11 @@ a subset of the possible backends.

    decoding_type: ClassVar[str] = 'Draft_Target'#
    +
    +
    +property is_linear_tree: bool#
    +
    +
    model_computed_fields = {}#
    @@ -17342,12 +17385,14 @@ a subset of the possible backends.

    return_perf_metrics: bool = False,
    orchestrator_type: ~typing.Literal['rpc',
    'ray'] | None = None,
-build_config: ~tensorrt_llm.builder.BuildConfig | None = None,
+env_overrides: ~typing.Dict[str, str] | None = None,
    garbage_collection_gen0_threshold: int = 20000,
    cuda_graph_config: ~tensorrt_llm.llmapi.llm_args.CudaGraphConfig | None = <factory>,
    attention_dp_config: ~tensorrt_llm.llmapi.llm_args.AttentionDpConfig | None = None,
    disable_overlap_scheduler: bool = False,
    moe_config: ~tensorrt_llm.llmapi.llm_args.MoeConfig = <factory>,
+nvfp4_gemm_config: ~tensorrt_llm.llmapi.llm_args.Nvfp4GemmConfig = <factory>,
    attn_backend: str = 'TRTLLM',
    sampler_type: str | ~tensorrt_llm.llmapi.llm_args.SamplerType = SamplerType.auto,
    enable_iter_perf_stats: bool = False,
    @@ -17378,6 +17423,7 @@ a subset of the possible backends.

    mm_encoder_only: bool = False,
    ray_worker_extension_cls: str | None = None,
    enable_sleep: bool = False,
+disable_flashinfer_sampling: bool = False,
)[source]#

@@ -17430,12 +17476,6 @@ a subset of the possible backends.

    stable Batched logits processor.

    -
    -
    -field build_config: BuildConfig | None = None#
    -

    deprecated Build config.

    -
    -
    field cache_transceiver_config: CacheTransceiverConfig | None = None#
    @@ -17476,6 +17516,12 @@ If checkpoint_format and checkpoint_loader are both provided, checkpoint_loader

beta CUDA graph config. If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_config.batch_sizes, and are enabled only for batches that consist entirely of decoding requests (capturing a single graph that also covers prefill requests is hard because the input shapes depend on the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory.

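As a sketch of the behavior described above, a caller might restrict graph capture to a few decode batch sizes; batch_sizes and enable_padding are existing CudaGraphConfig fields, and the model path is a placeholder:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig

llm = LLM(
    model="/path/to/model",  # placeholder
    cuda_graph_config=CudaGraphConfig(
        batch_sizes=[1, 2, 4, 8],  # each captured graph can cost up to ~200 MB
        enable_padding=True,       # pad decode-only batches up to a captured size
    ),
)
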
    +
    +
    +field disable_flashinfer_sampling: bool = False#
    +

    prototype Disable the use of FlashInfer.sampling. This option is likely to be removed in the future.

    +
    +
    field disable_overlap_scheduler: bool = False#
    @@ -17548,6 +17594,12 @@ If checkpoint_format and checkpoint_loader are both provided, checkpoint_loader

prototype Enable the LLM sleep feature. The sleep feature requires extra setup that may slow down model loading. Only enable it if you intend to use this feature.

    +
    +
    +field env_overrides: Dict[str, str] | None = None#
    +

prototype [EXPERIMENTAL] Environment variable overrides. NOTE: environment variables that the code caches at import time won’t pick up these overrides unless the code re-reads them from os.environ on demand.

    +
    +
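A hedged sketch combining the two new TorchLlmArgs knobs from this hunk, env_overrides and disable_flashinfer_sampling (the environment variable shown is an arbitrary illustration, not a documented name):

from tensorrt_llm import LLM

llm = LLM(
    model="/path/to/model",                    # placeholder
    env_overrides={"TLLM_EXAMPLE_FLAG": "1"},  # hypothetical variable name
    disable_flashinfer_sampling=True,          # prototype flag, may be removed later
)
# Per the note above, modules that cached os.environ at import time will
# not observe env_overrides unless they re-read os.environ on demand.
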
    field fail_fast_on_attention_window_too_large: bool = False#
    @@ -17692,6 +17744,12 @@ If checkpoint_format and checkpoint_loader are both provided, checkpoint_loader

    prototype The number of processes used for postprocessing the generated tokens, including detokenization.

    +
    +
    +field nvfp4_gemm_config: Nvfp4GemmConfig [Optional]#
    +

    beta NVFP4 GEMM backend config.

    +
    +
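A sketch of passing the new beta NVFP4 GEMM backend config explicitly; the hunk only shows that the field defaults via a factory, so only default construction is assumed here:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi.llm_args import Nvfp4GemmConfig

llm = LLM(
    model="/path/to/nvfp4-model",         # placeholder
    nvfp4_gemm_config=Nvfp4GemmConfig(),  # field contents not shown in this hunk
)
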
    field orchestrator_type: Literal['rpc', 'ray'] | None = None#
    @@ -17912,22 +17970,11 @@ validated to form a valid model.

    validator init_backend  »  backend[source]#
    -
    -
    -validator init_build_config  »  all fields#
    -

    Creating a default BuildConfig if none is provided

    -
    -
    validator set_default_max_input_len  »  all fields#
    -
    -
    -validator set_runtime_knobs_from_build_config  »  all fields#
    -
    -
    validator sync_quant_config_with_kv_cache_config_dtype  »  all fields[source]#
    @@ -17964,16 +18011,6 @@ validated to form a valid model.

    Validate batch wait timeout.

    -
    -
    -validator validate_build_config_remaining  »  all fields#
    -
-
-
    -
    -validator validate_build_config_with_runtime_params  »  all fields#
    -
    -
    validator validate_checkpoint_format  »  all fields[source]#
    @@ -18010,19 +18047,14 @@ validated to form a valid model.

    -
    -validator validate_model  »  model#
    +
    +validator validate_misc  »  all fields[source]#
    -
    -validator validate_model_format_misc  »  all fields#
    -

    Load the model format, and do the following:

-
-
1. Load the build_config if got an engine.

2. Load the parallel_config if got a checkpoint.

-
-
    +
    +validator validate_model  »  model#
    +
    @@ -18046,7 +18078,7 @@ validated to form a valid model.

    -validator validate_speculative_config  »  all fields#
    +validator validate_speculative_config  »  all fields[source]#
    @@ -18180,6 +18212,8 @@ validated to form a valid model.

    return_perf_metrics: bool = False,
    orchestrator_type: ~typing.Literal['rpc',
    'ray'] | None = None,
+env_overrides: ~typing.Dict[str, str] | None = None,
    enable_tqdm: bool = False,
    workspace: str | None = None,
    enable_build_cache: object = False,
    @@ -18299,6 +18333,12 @@ validated to form a valid model.

    Enable tqdm for progress bar.

    +
    +
    +field env_overrides: Dict[str, str] | None = None#
    +

[EXPERIMENTAL] Environment variable overrides. NOTE: environment variables that the code caches at import time won’t pick up these overrides unless the code re-reads them from os.environ on demand.

    +
    +
    field extended_runtime_perf_knob_config: ExtendedRuntimePerfKnobConfig | None = None#
    @@ -18613,7 +18653,7 @@ validated to form a valid model.

    -validator init_build_config  »  all fields#
    +validator init_build_config  »  all fields[source]#

    Creating a default BuildConfig if none is provided

    @@ -18627,11 +18667,6 @@ validated to form a valid model.

    validator set_default_max_input_len  »  all fields#
    -
    -
    -validator set_runtime_knobs_from_build_config  »  all fields#
    -
    -
    validator setup_embedding_parallel_mode  »  all fields[source]#
    @@ -18645,12 +18680,12 @@ validated to form a valid model.

    -validator validate_build_config_remaining  »  all fields#
    +validator validate_build_config_remaining  »  all fields[source]#
    -validator validate_build_config_with_runtime_params  »  all fields#
    +validator validate_build_config_with_runtime_params  »  all fields[source]#
    @@ -18685,7 +18720,7 @@ validated to form a valid model.

    -validator validate_model_format_misc  »  all fields#
    +validator validate_model_format_misc  »  all fields[source]#

    Load the model format, and do the following:

    1. Load the build_config if got an engine.

2. Load the parallel_config if got a checkpoint.

@@ -18715,7 +18750,7 @@ validated to form a valid model.

      -validator validate_speculative_config  »  all fields#
      +validator validate_speculative_config  »  all fields[source]#
      @@ -19420,6 +19455,11 @@ a subset of the possible backends.

      decoding_type: ClassVar[str] = 'AUTO'#
      +
      +
      +property is_linear_tree: bool#
      +
      +
      model_computed_fields = {}#
      @@ -20888,6 +20928,11 @@ a subset of the possible backends.

      decoding_type: ClassVar[str] = 'SaveState'#
      +
      +
      +property is_linear_tree: bool#
      +
      +
      model_computed_fields = {}#
@@ -20951,12 +20996,13 @@ Otherwise, assume Eagle3 base set and return 3 + 1 (for post norm last hidden state)
      *,
-window_size: int | None = None,
-kernel_size: int | None = None,
-topr: int | float | None = 76,
-topk: int | None = 128,
-prompt_budget: int | None = 1266,
-page_size: int | None = 3,
+window_size: int | None = 32,
+kernel_size: int | None = 63,
+topr: int | float | None = 128,
+topk: int | None = 64,
+prompt_budget: int | None = 2048,
+page_size: int | None = 4,
+kt_cache_dtype: str | None = 'float8_e5m2',
)[source]#

@@ -20964,37 +21010,43 @@ Otherwise, assume Eagle3 base set and return 3 + 1 (for post norm last hidden state)

      Configuration for RocketKV sparse attention.

      -field kernel_size: int | None = None#
      +field kernel_size: int | None = 63#

      The kernel size for snap KV.

      +
      +
      +field kt_cache_dtype: str | None = 'float8_e5m2'#
      +

      KT cache dtype

      +
      +
      -field page_size: int | None = 3#
      +field page_size: int | None = 4#

      Page size

      -field prompt_budget: int | None = 1266#
      +field prompt_budget: int | None = 2048#

      Prompt budget

      -field topk: int | None = 128#
      +field topk: int | None = 64#

      Top-k

      -field topr: int | float | None = 76#
      +field topr: int | float | None = 128#

      Top-r

      -field window_size: int | None = None#
      +field window_size: int | None = 32#

      The window size for snap KV.

      @@ -21631,7 +21683,7 @@ a subset of the possible backends.

      -model_fields = {'kernel_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=None, description='The kernel size for snap KV.'), 'page_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=3, description='Page size'), 'prompt_budget': FieldInfo(annotation=Union[int, NoneType], required=False, default=1266, description='Prompt budget'), 'topk': FieldInfo(annotation=Union[int, NoneType], required=False, default=128, description='Top-k'), 'topr': FieldInfo(annotation=Union[int, float, NoneType], required=False, default=76, description='Top-r'), 'window_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=None, description='The window size for snap KV.')}#
      +model_fields = {'kernel_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=63, description='The kernel size for snap KV.'), 'kt_cache_dtype': FieldInfo(annotation=Union[str, NoneType], required=False, default='float8_e5m2', description='KT cache dtype', json_schema_extra={'choices': ['bfloat16', 'float8_e5m2']}), 'page_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=4, description='Page size'), 'prompt_budget': FieldInfo(annotation=Union[int, NoneType], required=False, default=2048, description='Prompt budget'), 'topk': FieldInfo(annotation=Union[int, NoneType], required=False, default=64, description='Top-k'), 'topr': FieldInfo(annotation=Union[int, float, NoneType], required=False, default=128, description='Top-r'), 'window_size': FieldInfo(annotation=Union[int, NoneType], required=False, default=32, description='The window size for snap KV.')}#
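For reference, constructing the config explicitly with the new defaults shown in model_fields above; the import path is assumed from the llm_args page this hunk documents:

from tensorrt_llm.llmapi.llm_args import RocketSparseAttentionConfig

sparse_cfg = RocketSparseAttentionConfig(
    window_size=32,       # snap-KV window, was None
    kernel_size=63,       # snap-KV kernel, was None
    topr=128,             # was 76
    topk=64,              # was 128
    prompt_budget=2048,   # was 1266
    page_size=4,          # was 3
    kt_cache_dtype="float8_e5m2",  # new field; schema choices: 'bfloat16', 'float8_e5m2'
)
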
      @@ -22795,6 +22847,7 @@ a subset of the possible backends.

  • validate_draft_len_schedule_and_sort
  • validate_positive_values
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -22846,6 +22899,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -22965,6 +23019,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -23213,6 +23268,7 @@ a subset of the possible backends.

  • MIXED_PRECISION
  • NO_QUANT
  • NVFP4
  • +
  • NVFP4_AWQ
  • W4A16
  • W4A16_AWQ
  • W4A16_GPTQ

    @@ -23596,6 +23652,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -23647,6 +23704,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -23740,6 +23798,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -23758,13 +23817,13 @@ a subset of the possible backends.

  • batch_wait_timeout_iters
  • batch_wait_timeout_ms
  • batched_logits_processor
  • -
  • build_config
  • cache_transceiver_config
  • checkpoint_format
  • checkpoint_loader
  • context_parallel_size
  • cp_config
  • cuda_graph_config
  • +
  • disable_flashinfer_sampling
  • disable_overlap_scheduler
  • dtype
  • enable_attention_dp

    @@ -23777,6 +23836,7 @@ a subset of the possible backends.

  • enable_lora
  • enable_min_latency
  • enable_sleep
  • +
  • env_overrides
  • fail_fast_on_attention_window_too_large
  • force_dynamic_quantization
  • garbage_collection_gen0_threshold

    @@ -23801,6 +23861,7 @@ a subset of the possible backends.

  • moe_tensor_parallel_size
  • mpi_session
  • num_postprocess_workers
  • +
  • nvfp4_gemm_config
  • orchestrator_type
  • otlp_traces_endpoint
  • peft_cache_config

    @@ -23836,25 +23897,21 @@ a subset of the possible backends.

  • get_executor_config()
  • get_runtime_sizes()
  • init_backend
  • -
  • init_build_config
  • set_default_max_input_len
  • -
  • set_runtime_knobs_from_build_config
  • sync_quant_config_with_kv_cache_config_dtype
  • validate_and_init_tokenizer
  • validate_attention_dp_config
  • validate_batch_wait_max_tokens_ratio
  • validate_batch_wait_timeout_iters
  • validate_batch_wait_timeout_ms
  • -
  • validate_build_config_remaining
  • -
  • validate_build_config_with_runtime_params
  • validate_checkpoint_format
  • validate_cuda_graph_config
  • validate_dtype
  • validate_gpus_per_node
  • validate_load_balancer
  • validate_lora_config_consistency
  • +
  • validate_misc
  • validate_model
  • -
  • validate_model_format_misc
  • validate_parallel_config
  • validate_peft_cache_config
  • validate_ray_worker_extension_cls

    @@ -23895,6 +23952,7 @@ a subset of the possible backends.

  • enable_lora
  • enable_prompt_adapter
  • enable_tqdm
  • +
  • env_overrides
  • extended_runtime_perf_knob_config
  • fail_fast_on_attention_window_too_large
  • fast_build

    @@ -23949,7 +24007,6 @@ a subset of the possible backends.

  • init_build_config
  • init_calib_config
  • set_default_max_input_len
  • -
  • set_runtime_knobs_from_build_config
  • setup_embedding_parallel_mode
  • validate_and_init_tokenizer
  • validate_build_config_remaining

    @@ -24019,6 +24076,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -24125,6 +24183,7 @@ a subset of the possible backends.

  • validate()
  • validate_draft_len_schedule_and_sort
  • decoding_type
  • +
  • is_linear_tree
  • model_computed_fields
  • model_config
  • model_extra

    @@ -24136,6 +24195,7 @@ a subset of the possible backends.

  • RocketSparseAttentionConfig
@@ -803,9 +898,9 @@ python build_and_run_ad.py

diff --git a/latest/torch/auto_deploy/advanced/logging.html b/latest/torch/auto_deploy/advanced/logging.html
index 01a66ed444..c3c3b2f025 100644
--- a/latest/torch/auto_deploy/advanced/logging.html
+++ b/latest/torch/auto_deploy/advanced/logging.html
@@ -61,7 +61,7 @@
@@ -74,7 +74,7 @@
@@ -370,7 +370,9 @@
  • Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware
  • +
  • Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware
  • Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
  • +
  • Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell