From 1d509f9205782f9683dde1154bdd70e6ad7cd7c1 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 7 Aug 2025 06:26:14 +0000 Subject: [PATCH] Update GitHub pages in root to v1.0.0rc6 --- .buildinfo | 2 +- _cpp_gen/executor.html | 5368 ++--- _cpp_gen/runtime.html | 17837 +++++++++------- .../attention.py | 249 +- .../b6815cf245cc7dc7a26a6f727fdc2dc4/model.py | 39 +- .../model_engine.py | 50 +- .../llm_args.py | 57 +- _modules/index.html | 14 +- _modules/tensorrt_llm/builder.html | 14 +- .../tensorrt_llm/disaggregated_params.html | 14 +- _modules/tensorrt_llm/executor/result.html | 14 +- _modules/tensorrt_llm/executor/utils.html | 14 +- _modules/tensorrt_llm/functional.html | 14 +- _modules/tensorrt_llm/layers/activation.html | 14 +- _modules/tensorrt_llm/layers/attention.html | 14 +- _modules/tensorrt_llm/layers/cast.html | 14 +- _modules/tensorrt_llm/layers/conv.html | 14 +- _modules/tensorrt_llm/layers/embedding.html | 14 +- _modules/tensorrt_llm/layers/linear.html | 14 +- _modules/tensorrt_llm/layers/mlp.html | 14 +- .../tensorrt_llm/layers/normalization.html | 14 +- _modules/tensorrt_llm/layers/pooling.html | 14 +- _modules/tensorrt_llm/llmapi/build_cache.html | 14 +- _modules/tensorrt_llm/llmapi/llm.html | 33 +- _modules/tensorrt_llm/llmapi/llm_args.html | 80 +- _modules/tensorrt_llm/llmapi/mpi_session.html | 14 +- .../tensorrt_llm/models/baichuan/model.html | 14 +- _modules/tensorrt_llm/models/bert/model.html | 14 +- _modules/tensorrt_llm/models/bloom/model.html | 14 +- .../tensorrt_llm/models/chatglm/config.html | 14 +- .../tensorrt_llm/models/chatglm/model.html | 14 +- _modules/tensorrt_llm/models/clip/model.html | 14 +- .../tensorrt_llm/models/cogvlm/config.html | 14 +- .../tensorrt_llm/models/cogvlm/model.html | 14 +- .../tensorrt_llm/models/commandr/model.html | 14 +- _modules/tensorrt_llm/models/dbrx/config.html | 14 +- _modules/tensorrt_llm/models/dbrx/model.html | 14 +- .../models/deepseek_v1/model.html | 14 +- .../models/deepseek_v2/model.html | 14 +- _modules/tensorrt_llm/models/dit/model.html | 14 +- _modules/tensorrt_llm/models/eagle/model.html | 14 +- .../tensorrt_llm/models/enc_dec/model.html | 14 +- .../tensorrt_llm/models/falcon/config.html | 14 +- .../tensorrt_llm/models/falcon/model.html | 14 +- .../tensorrt_llm/models/gemma/config.html | 14 +- _modules/tensorrt_llm/models/gemma/model.html | 14 +- _modules/tensorrt_llm/models/gpt/config.html | 14 +- _modules/tensorrt_llm/models/gpt/model.html | 14 +- _modules/tensorrt_llm/models/gptj/config.html | 14 +- _modules/tensorrt_llm/models/gptj/model.html | 14 +- .../tensorrt_llm/models/gptneox/model.html | 14 +- .../tensorrt_llm/models/llama/config.html | 14 +- _modules/tensorrt_llm/models/llama/model.html | 14 +- _modules/tensorrt_llm/models/mamba/model.html | 14 +- .../tensorrt_llm/models/medusa/config.html | 14 +- .../tensorrt_llm/models/medusa/model.html | 14 +- .../tensorrt_llm/models/mllama/model.html | 14 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 14 +- .../tensorrt_llm/models/modeling_utils.html | 14 +- _modules/tensorrt_llm/models/mpt/model.html | 14 +- .../models/multimodal_encoders/config.html | 14 +- .../models/multimodal_encoders/model.html | 14 +- _modules/tensorrt_llm/models/opt/model.html | 14 +- _modules/tensorrt_llm/models/phi/model.html | 14 +- _modules/tensorrt_llm/models/phi3/model.html | 14 +- .../models/recurrentgemma/model.html | 14 +- .../tensorrt_llm/models/redrafter/model.html | 14 +- _modules/tensorrt_llm/plugin/plugin.html | 14 +- _modules/tensorrt_llm/quantization/mode.html | 14 +- .../quantization/quantize_by_modelopt.html | 14 +- .../runtime/enc_dec_model_runner.html | 14 +- _modules/tensorrt_llm/runtime/generation.html | 14 +- .../runtime/kv_cache_manager.html | 14 +- .../tensorrt_llm/runtime/model_runner.html | 14 +- .../runtime/model_runner_cpp.html | 14 +- .../runtime/multimodal_model_runner.html | 14 +- _modules/tensorrt_llm/runtime/session.html | 14 +- _modules/tensorrt_llm/sampling_params.html | 71 +- _sources/_cpp_gen/executor.rst.txt | 42 +- _sources/_cpp_gen/runtime.rst.txt | 314 +- ..._Expert_Parallelism_in_TensorRT-LLM.md.txt | 5 +- ...rmance_Analysis_And_Auto_Enablement.md.txt | 186 + ...t_Parallelism_in_TensorRT-LLM_part2.md.txt | 322 + .../blog9_Deploying_GPT_OSS_on_TRTLLM.md.txt | 362 + _sources/commands/trtllm-serve/index.rst.txt | 9 + .../run-benchmark-with-trtllm-serve.md.txt | 222 + .../trtllm-serve/trtllm-serve.rst.txt | 263 + ...enai_completion_client_json_schema.rst.txt | 2 +- _sources/index.rst.txt | 4 +- _sources/installation/linux.md.txt | 20 +- _sources/llm-api/reference.rst.txt | 12 +- _sources/performance/perf-analysis.md.txt | 4 +- _sources/performance/perf-overview.md.txt | 187 +- _sources/quick-start-guide.md.txt | 97 +- _sources/reference/support-matrix.md.txt | 2 +- _sources/release-notes.md.txt | 1 + .../feature_combination_matrix.md.txt | 6 +- advanced/disaggregated-service.html | 14 +- advanced/executor.html | 14 +- advanced/expert-parallelism.html | 14 +- advanced/gpt-attention.html | 14 +- advanced/gpt-runtime.html | 14 +- advanced/graph-rewriting.html | 14 +- advanced/kv-cache-management.html | 14 +- advanced/kv-cache-reuse.html | 14 +- advanced/lora.html | 14 +- advanced/lowprecision-pcie-allreduce.html | 14 +- advanced/open-sourced-cutlass-kernels.html | 14 +- advanced/speculative-decoding.html | 14 +- advanced/weight-streaming.html | 14 +- architecture/add-model.html | 14 +- architecture/checkpoint.html | 14 +- architecture/core-concepts.html | 14 +- architecture/model-weights-loader.html | 14 +- architecture/overview.html | 20 +- architecture/workflow.html | 14 +- ...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 14 +- blogs/Falcon180B-H200.html | 14 +- blogs/H100vsA100.html | 14 +- blogs/H200launch.html | 14 +- blogs/XQA-kernel.html | 14 +- blogs/quantization-in-TRT-LLM.html | 14 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 14 +- ...1_MTP_Implementation_and_Optimization.html | 14 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 14 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 22 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 14 +- .../blog6_Llama4_maverick_eagle_guide.html | 14 +- ...formance_Analysis_And_Auto_Enablement.html | 930 + ...ert_Parallelism_in_TensorRT-LLM_part2.html | 975 + .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 996 + commands/trtllm-bench.html | 18 +- commands/trtllm-build.html | 18 +- commands/trtllm-serve/index.html | 656 + .../run-benchmark-with-trtllm-serve.html | 904 + commands/trtllm-serve/trtllm-serve.html | 1127 + dev-on-cloud/build-image-to-dockerhub.html | 14 +- dev-on-cloud/dev-on-runpod.html | 14 +- examples/curl_chat_client.html | 14 +- examples/curl_chat_client_for_multimodal.html | 14 +- examples/curl_completion_client.html | 14 +- examples/customization.html | 14 +- examples/deepseek_r1_reasoning_parser.html | 14 +- examples/genai_perf_client.html | 14 +- .../genai_perf_client_for_multimodal.html | 14 +- examples/index.html | 14 +- examples/llm_api_examples.html | 14 +- examples/llm_guided_decoding.html | 14 +- examples/llm_inference.html | 14 +- examples/llm_inference_async.html | 14 +- examples/llm_inference_async_streaming.html | 14 +- examples/llm_inference_distributed.html | 14 +- examples/llm_logits_processor.html | 14 +- examples/llm_mgmn_llm_distributed.html | 14 +- examples/llm_mgmn_trtllm_bench.html | 14 +- examples/llm_mgmn_trtllm_serve.html | 14 +- examples/llm_multilora.html | 14 +- examples/llm_runtime.html | 14 +- examples/llm_sampling.html | 14 +- examples/llm_speculative_decoding.html | 14 +- examples/openai_chat_client.html | 14 +- .../openai_chat_client_for_multimodal.html | 14 +- examples/openai_completion_client.html | 14 +- .../openai_completion_client_for_lora.html | 14 +- .../openai_completion_client_json_schema.html | 102 +- examples/trtllm_serve_examples.html | 14 +- genindex.html | 581 +- index.html | 50 +- installation/build-from-source-linux.html | 14 +- installation/containers.html | 16 +- installation/linux.html | 31 +- key-features.html | 14 +- llm-api/index.html | 14 +- llm-api/reference.html | 720 +- objects.inv | Bin 155783 -> 163168 bytes overview.html | 14 +- performance/perf-analysis.html | 18 +- performance/perf-benchmarking.html | 14 +- performance/perf-overview.html | 476 +- .../benchmarking-default-performance.html | 14 +- .../deciding-model-sharding-strategy.html | 14 +- .../fp8-quantization.html | 14 +- .../performance-tuning-guide/index.html | 14 +- ...ing-max-batch-size-and-max-num-tokens.html | 14 +- .../useful-build-time-flags.html | 14 +- .../useful-runtime-flags.html | 14 +- py-modindex.html | 14 +- python-api/tensorrt_llm.functional.html | 14 +- python-api/tensorrt_llm.layers.html | 14 +- python-api/tensorrt_llm.models.html | 14 +- python-api/tensorrt_llm.plugin.html | 14 +- python-api/tensorrt_llm.quantization.html | 14 +- python-api/tensorrt_llm.runtime.html | 14 +- quick-start-guide.html | 142 +- reference/ci-overview.html | 14 +- reference/dev-containers.html | 14 +- reference/memory.html | 14 +- reference/precision.html | 14 +- reference/support-matrix.html | 16 +- reference/troubleshooting.html | 14 +- release-notes.html | 15 +- search.html | 14 +- searchindex.js | 2 +- torch.html | 14 +- torch/adding_new_model.html | 14 +- torch/arch_overview.html | 14 +- torch/attention.html | 14 +- .../features/feature_combination_matrix.html | 20 +- torch/features/overlap_scheduler.html | 14 +- torch/features/quantization.html | 14 +- torch/features/sampling.html | 14 +- torch/kv_cache_manager.html | 14 +- torch/scheduler.html | 14 +- 213 files changed, 22996 insertions(+), 12945 deletions(-) create mode 100644 _sources/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md.txt create mode 100644 _sources/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md.txt create mode 100644 _sources/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md.txt create mode 100644 _sources/commands/trtllm-serve/index.rst.txt create mode 100644 _sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt create mode 100644 _sources/commands/trtllm-serve/trtllm-serve.rst.txt create mode 100644 blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html create mode 100644 blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html create mode 100644 blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html create mode 100644 commands/trtllm-serve/index.html create mode 100644 commands/trtllm-serve/run-benchmark-with-trtllm-serve.html create mode 100644 commands/trtllm-serve/trtllm-serve.html diff --git a/.buildinfo b/.buildinfo index 255f2b4d24..57ce40362c 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 04d569d8861c27285138a24e2af3e496 +config: 4c4e434803756ce4857c43609ad607a5 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_cpp_gen/executor.html b/_cpp_gen/executor.html index 6670d5d2e8..dca5128f63 100644 --- a/_cpp_gen/executor.html +++ b/_cpp_gen/executor.html @@ -59,7 +59,7 @@ @@ -71,7 +71,7 @@ - + @@ -391,7 +391,11 @@
  • trtllm-bench
  • trtllm-build
  • -
  • trtllm-serve
  • +
  • trtllm-serve
    +
  • Architecture

    Architecture

    Architecture

  • @@ -532,7 +536,7 @@
  • @@ -548,9 +552,9 @@ --config_file
  • @@ -562,6 +566,13 @@
  • trtllm-bench-latency command line option
  • trtllm-bench-throughput command line option +
  • + +
  • + --disable_chunked_context + +
  • @@ -600,7 +611,7 @@ --ep_size
  • @@ -609,28 +620,28 @@
  • --fail_fast_on_attention_window_too_large
  • --gpus_per_node
  • --host
  • @@ -655,7 +666,7 @@ --kv_cache_free_gpu_memory_fraction
  • @@ -664,11 +675,11 @@
  • @@ -679,14 +690,14 @@
  • trtllm-bench-throughput command line option
  • -
  • trtllm-serve-serve command line option +
  • trtllm-serve-serve command line option
  • --max_beam_width
  • @@ -706,7 +717,7 @@
  • trtllm-bench-throughput command line option
  • -
  • trtllm-serve-serve command line option +
  • trtllm-serve-serve command line option
  • @@ -719,7 +730,7 @@
  • trtllm-bench-throughput command line option
  • -
  • trtllm-serve-serve command line option +
  • trtllm-serve-serve command line option
  • @@ -733,9 +744,9 @@ --metadata_server_config_file
  • @@ -747,6 +758,8 @@
  • trtllm-bench-throughput command line option
  • + + - -