From 2af4947777e62f735814804f5aab74dac1d1d3df Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 23 Dec 2025 02:41:10 +0000 Subject: [PATCH] Update latest GitHub pages to v1.2.0rc6 --- latest/.buildinfo | 2 +- latest/_cpp_gen/executor.html | 50 +- latest/_cpp_gen/runtime.html | 10 +- .../attention.py | 28 + .../model_engine.py | 120 +- latest/_modules/index.html | 13 +- .../tensorrt_llm/_torch/async_llm.html | 770 +++++++ latest/_modules/tensorrt_llm/builder.html | 10 +- .../tensorrt_llm/disaggregated_params.html | 10 +- .../tensorrt_llm/executor/request.html | 10 +- .../tensorrt_llm/executor/result.html | 10 +- .../_modules/tensorrt_llm/executor/utils.html | 10 +- latest/_modules/tensorrt_llm/functional.html | 20 +- .../tensorrt_llm/layers/activation.html | 10 +- .../tensorrt_llm/layers/attention.html | 10 +- latest/_modules/tensorrt_llm/layers/cast.html | 10 +- latest/_modules/tensorrt_llm/layers/conv.html | 10 +- .../tensorrt_llm/layers/embedding.html | 10 +- .../_modules/tensorrt_llm/layers/linear.html | 10 +- latest/_modules/tensorrt_llm/layers/mlp.html | 10 +- .../tensorrt_llm/layers/normalization.html | 10 +- .../_modules/tensorrt_llm/layers/pooling.html | 10 +- .../tensorrt_llm/llmapi/build_cache.html | 10 +- latest/_modules/tensorrt_llm/llmapi/llm.html | 13 +- .../tensorrt_llm/llmapi/llm_args.html | 168 +- .../tensorrt_llm/llmapi/mm_encoder.html | 10 +- .../tensorrt_llm/llmapi/mpi_session.html | 10 +- .../tensorrt_llm/models/baichuan/model.html | 10 +- .../tensorrt_llm/models/bert/model.html | 10 +- .../tensorrt_llm/models/bloom/model.html | 10 +- .../tensorrt_llm/models/chatglm/config.html | 10 +- .../tensorrt_llm/models/chatglm/model.html | 10 +- .../tensorrt_llm/models/clip/model.html | 10 +- .../tensorrt_llm/models/cogvlm/config.html | 10 +- .../tensorrt_llm/models/cogvlm/model.html | 10 +- .../tensorrt_llm/models/commandr/model.html | 10 +- .../tensorrt_llm/models/dbrx/config.html | 10 +- .../tensorrt_llm/models/dbrx/model.html | 10 +- .../models/deepseek_v1/model.html | 10 +- .../models/deepseek_v2/model.html | 10 +- .../tensorrt_llm/models/dit/model.html | 10 +- .../tensorrt_llm/models/eagle/model.html | 10 +- .../tensorrt_llm/models/enc_dec/model.html | 10 +- .../tensorrt_llm/models/falcon/config.html | 10 +- .../tensorrt_llm/models/falcon/model.html | 10 +- .../tensorrt_llm/models/gemma/config.html | 10 +- .../tensorrt_llm/models/gemma/model.html | 10 +- .../tensorrt_llm/models/gpt/config.html | 10 +- .../tensorrt_llm/models/gpt/model.html | 10 +- .../tensorrt_llm/models/gptj/config.html | 10 +- .../tensorrt_llm/models/gptj/model.html | 10 +- .../tensorrt_llm/models/gptneox/model.html | 10 +- .../tensorrt_llm/models/llama/config.html | 10 +- .../tensorrt_llm/models/llama/model.html | 10 +- .../tensorrt_llm/models/mamba/model.html | 10 +- .../tensorrt_llm/models/medusa/config.html | 10 +- .../tensorrt_llm/models/medusa/model.html | 10 +- .../tensorrt_llm/models/mllama/model.html | 10 +- .../tensorrt_llm/models/mmdit_sd3/model.html | 10 +- .../tensorrt_llm/models/modeling_utils.html | 10 +- .../tensorrt_llm/models/mpt/model.html | 10 +- .../models/multimodal_encoders/config.html | 10 +- .../models/multimodal_encoders/model.html | 10 +- .../tensorrt_llm/models/opt/model.html | 10 +- .../tensorrt_llm/models/phi/model.html | 10 +- .../tensorrt_llm/models/phi3/model.html | 10 +- .../models/recurrentgemma/model.html | 10 +- .../tensorrt_llm/models/redrafter/model.html | 10 +- .../_modules/tensorrt_llm/plugin/plugin.html | 10 +- 
.../tensorrt_llm/quantization/mode.html | 10 +- .../quantization/quantize_by_modelopt.html | 10 +- .../runtime/enc_dec_model_runner.html | 10 +- .../tensorrt_llm/runtime/generation.html | 10 +- .../runtime/kv_cache_manager.html | 10 +- .../tensorrt_llm/runtime/model_runner.html | 11 +- .../runtime/model_runner_cpp.html | 10 +- .../runtime/multimodal_model_runner.html | 10 +- .../tensorrt_llm/runtime/session.html | 10 +- .../tensorrt_llm/sampling_params.html | 21 +- ...tice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt | 32 +- ...t_Parallelism_in_TensorRT-LLM_part3.md.txt | 2 +- ...-R1_Performance_on_NVIDIA_B200_GPUs.md.txt | 2 +- ...Throughput_on_NVIDIA_Blackwell_GPUs.md.txt | 2 +- .../run-benchmark-with-trtllm-serve.md.txt | 2 +- .../trtllm-serve/trtllm-serve.rst.txt | 20 +- .../deployment-guide/config_table.rst.txt | 1074 +++++++++ ...ent-guide-for-deepseek-r1-on-trtllm.md.txt | 32 +- ...loyment-guide-for-gpt-oss-on-trtllm.md.txt | 24 +- ...uide-for-kimi-k2-thinking-on-trtllm.md.txt | 15 + ...nt-guide-for-llama3.3-70b-on-trtllm.md.txt | 6 +- ...nt-guide-for-llama4-scout-on-trtllm.md.txt | 6 +- ...ment-guide-for-qwen3-next-on-trtllm.md.txt | 4 +- ...eployment-guide-for-qwen3-on-trtllm.md.txt | 4 +- .../_sources/deployment-guide/index.rst.txt | 60 +- .../deployment-guide/note_sections.rst.txt | 36 + .../developer-guide/perf-analysis.md.txt | 10 +- .../developer-guide/perf-benchmarking.md.txt | 27 +- .../developer-guide/perf-overview.md.txt | 2 +- .../examples/curl_chat_client.rst.txt | 2 +- .../curl_chat_client_for_multimodal.rst.txt | 2 +- .../examples/curl_completion_client.rst.txt | 2 +- .../examples/curl_responses_client.rst.txt | 10 + .../deepseek_r1_reasoning_parser.rst.txt | 2 +- .../examples/genai_perf_client.rst.txt | 2 +- .../genai_perf_client_for_multimodal.rst.txt | 2 +- .../examples/llm_guided_decoding.rst.txt | 2 +- .../_sources/examples/llm_inference.rst.txt | 2 +- .../examples/llm_inference_async.rst.txt | 2 +- .../llm_inference_async_streaming.rst.txt | 2 +- .../llm_inference_distributed.rst.txt | 2 +- .../examples/llm_kv_cache_connector.rst.txt | 2 +- .../examples/llm_kv_cache_offloading.rst.txt | 2 +- .../examples/llm_logits_processor.rst.txt | 2 +- .../examples/llm_mgmn_llm_distributed.rst.txt | 2 +- .../examples/llm_mgmn_trtllm_bench.rst.txt | 4 +- .../examples/llm_mgmn_trtllm_serve.rst.txt | 2 +- .../_sources/examples/llm_multilora.rst.txt | 2 +- latest/_sources/examples/llm_runtime.rst.txt | 2 +- latest/_sources/examples/llm_sampling.rst.txt | 2 +- .../examples/llm_sparse_attention.rst.txt | 2 +- .../examples/llm_speculative_decoding.rst.txt | 2 +- .../examples/openai_chat_client.rst.txt | 2 +- .../openai_chat_client_for_multimodal.rst.txt | 2 +- .../examples/openai_completion_client.rst.txt | 2 +- .../openai_completion_client_for_lora.rst.txt | 2 +- ...enai_completion_client_json_schema.rst.txt | 2 +- .../examples/openai_responses_client.rst.txt | 10 + .../examples/trtllm_serve_examples.rst.txt | 2 + .../auto_deploy/support_matrix.md.txt | 2 +- latest/_sources/features/quantization.md.txt | 95 +- .../legacy/performance/perf-analysis.md.txt | 8 +- .../performance/perf-benchmarking.md.txt | 6 +- .../legacy/reference/support-matrix.md.txt | 1 + latest/_sources/llm-api/reference.rst.txt | 12 +- .../_sources/models/supported-models.md.txt | 2 + latest/_sources/overview.md.txt | 6 +- latest/_sources/quick-start-guide.md.txt | 2 +- .../advanced/expert_configurations.md.txt | 19 + .../torch/auto_deploy/support_matrix.md.txt | 2 +- .../torch/features/quantization.md.txt | 6 +- 
...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 40 +- latest/blogs/Falcon180B-H200.html | 10 +- latest/blogs/H100vsA100.html | 10 +- latest/blogs/H200launch.html | 10 +- latest/blogs/XQA-kernel.html | 10 +- latest/blogs/quantization-in-TRT-LLM.html | 10 +- .../blog10_ADP_Balance_Strategy.html | 10 +- .../tech_blog/blog11_GPT_OSS_Eagle3.html | 10 +- ...ded_Decoding_and_Speculative_Decoding.html | 10 +- ...ompute_Implementation_in_TensorRT-LLM.html | 10 +- ...ert_Parallelism_in_TensorRT-LLM_part3.html | 12 +- ...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 12 +- ...1_MTP_Implementation_and_Optimization.html | 10 +- ...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 12 +- ...ng_Expert_Parallelism_in_TensorRT-LLM.html | 10 +- ...Disaggregated_Serving_in_TensorRT-LLM.html | 10 +- .../blog6_Llama4_maverick_eagle_guide.html | 10 +- ...formance_Analysis_And_Auto_Enablement.html | 10 +- ...ert_Parallelism_in_TensorRT-LLM_part2.html | 10 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.html | 10 +- latest/commands/trtllm-bench.html | 10 +- latest/commands/trtllm-build.html | 10 +- latest/commands/trtllm-eval.html | 10 +- latest/commands/trtllm-serve/index.html | 10 +- .../run-benchmark-with-trtllm-serve.html | 12 +- .../commands/trtllm-serve/trtllm-serve.html | 51 +- latest/deployment-guide/config_table.html | 1928 +++++++++++++++++ ...yment-guide-for-deepseek-r1-on-trtllm.html | 429 +++- ...eployment-guide-for-gpt-oss-on-trtllm.html | 903 +++++++- ...-guide-for-kimi-k2-thinking-on-trtllm.html | 25 +- ...ment-guide-for-llama3.3-70b-on-trtllm.html | 14 +- ...ment-guide-for-llama4-scout-on-trtllm.html | 14 +- ...oyment-guide-for-qwen3-next-on-trtllm.html | 12 +- .../deployment-guide-for-qwen3-on-trtllm.html | 12 +- latest/deployment-guide/index.html | 1334 +++++++++++- latest/deployment-guide/note_sections.html | 678 ++++++ latest/developer-guide/api-change.html | 10 +- latest/developer-guide/ci-overview.html | 10 +- latest/developer-guide/dev-containers.html | 10 +- latest/developer-guide/kv-transfer.html | 10 +- latest/developer-guide/overview.html | 10 +- latest/developer-guide/perf-analysis.html | 20 +- latest/developer-guide/perf-benchmarking.html | 45 +- latest/developer-guide/perf-overview.html | 16 +- latest/examples/curl_chat_client.html | 12 +- .../curl_chat_client_for_multimodal.html | 12 +- latest/examples/curl_completion_client.html | 18 +- latest/examples/curl_responses_client.html | 679 ++++++ latest/examples/customization.html | 10 +- .../deepseek_r1_reasoning_parser.html | 18 +- latest/examples/dynamo_k8s_example.html | 16 +- latest/examples/genai_perf_client.html | 12 +- .../genai_perf_client_for_multimodal.html | 12 +- latest/examples/index.html | 12 +- latest/examples/kvcacheconfig.html | 10 +- latest/examples/kvcacheretentionconfig.html | 10 +- latest/examples/llm_api_examples.html | 10 +- latest/examples/llm_guided_decoding.html | 12 +- latest/examples/llm_inference.html | 14 +- latest/examples/llm_inference_async.html | 12 +- .../llm_inference_async_streaming.html | 12 +- .../examples/llm_inference_distributed.html | 12 +- latest/examples/llm_kv_cache_connector.html | 12 +- latest/examples/llm_kv_cache_offloading.html | 12 +- latest/examples/llm_logits_processor.html | 12 +- latest/examples/llm_mgmn_llm_distributed.html | 12 +- latest/examples/llm_mgmn_trtllm_bench.html | 127 +- latest/examples/llm_mgmn_trtllm_serve.html | 12 +- latest/examples/llm_multilora.html | 12 +- latest/examples/llm_runtime.html | 12 +- latest/examples/llm_sampling.html | 12 +- 
latest/examples/llm_sparse_attention.html | 12 +- latest/examples/llm_speculative_decoding.html | 12 +- latest/examples/openai_chat_client.html | 12 +- .../openai_chat_client_for_multimodal.html | 12 +- latest/examples/openai_completion_client.html | 12 +- .../openai_completion_client_for_lora.html | 12 +- .../openai_completion_client_json_schema.html | 18 +- latest/examples/openai_responses_client.html | 684 ++++++ latest/examples/trtllm_serve_examples.html | 12 +- latest/features/additional-outputs.html | 10 +- latest/features/attention.html | 12 +- .../benchmarking_with_trtllm_bench.html | 10 +- .../auto_deploy/advanced/example_run.html | 10 +- .../advanced/expert_configurations.html | 10 +- .../auto_deploy/advanced/logging.html | 10 +- .../auto_deploy/advanced/workflow.html | 10 +- latest/features/auto_deploy/auto-deploy.html | 10 +- .../features/auto_deploy/support_matrix.html | 12 +- latest/features/checkpoint-loading.html | 10 +- latest/features/disagg-serving.html | 10 +- .../features/feature-combination-matrix.html | 10 +- latest/features/guided-decoding.html | 10 +- latest/features/helix.html | 10 +- latest/features/kv-cache-connector.html | 10 +- latest/features/kvcache.html | 10 +- latest/features/long-sequence.html | 10 +- latest/features/lora.html | 10 +- latest/features/multi-modality.html | 14 +- latest/features/overlap-scheduler.html | 10 +- .../paged-attention-ifb-scheduler.html | 14 +- latest/features/parallel-strategy.html | 10 +- latest/features/quantization.html | 76 +- latest/features/ray-orchestrator.html | 10 +- latest/features/sampling.html | 16 +- latest/features/speculative-decoding.html | 10 +- ...orch_compile_and_piecewise_cuda_graph.html | 10 +- latest/genindex.html | 148 +- latest/index.html | 14 +- .../installation/build-from-source-linux.html | 12 +- latest/installation/containers.html | 12 +- latest/installation/index.html | 10 +- latest/installation/linux.html | 12 +- .../advanced/disaggregated-service.html | 10 +- latest/legacy/advanced/executor.html | 20 +- .../legacy/advanced/expert-parallelism.html | 10 +- latest/legacy/advanced/gpt-attention.html | 14 +- latest/legacy/advanced/gpt-runtime.html | 10 +- latest/legacy/advanced/graph-rewriting.html | 10 +- .../legacy/advanced/kv-cache-management.html | 10 +- latest/legacy/advanced/kv-cache-reuse.html | 10 +- latest/legacy/advanced/lora.html | 10 +- .../advanced/lowprecision-pcie-allreduce.html | 10 +- .../open-sourced-cutlass-kernels.html | 10 +- .../legacy/advanced/speculative-decoding.html | 10 +- latest/legacy/advanced/weight-streaming.html | 10 +- latest/legacy/architecture/add-model.html | 10 +- latest/legacy/architecture/checkpoint.html | 10 +- latest/legacy/architecture/core-concepts.html | 20 +- .../architecture/model-weights-loader.html | 10 +- latest/legacy/architecture/workflow.html | 10 +- .../build-image-to-dockerhub.html | 10 +- latest/legacy/dev-on-cloud/dev-on-runpod.html | 10 +- latest/legacy/key-features.html | 10 +- latest/legacy/performance/perf-analysis.html | 18 +- .../legacy/performance/perf-benchmarking.html | 16 +- .../benchmarking-default-performance.html | 10 +- .../deciding-model-sharding-strategy.html | 10 +- .../fp8-quantization.html | 10 +- .../performance-tuning-guide/index.html | 10 +- .../introduction.html | 10 +- ...ing-max-batch-size-and-max-num-tokens.html | 10 +- .../useful-build-time-flags.html | 10 +- .../useful-runtime-flags.html | 10 +- .../python-api/tensorrt_llm.functional.html | 16 +- .../python-api/tensorrt_llm.layers.html | 10 +- 
.../python-api/tensorrt_llm.models.html | 10 +- .../python-api/tensorrt_llm.plugin.html | 10 +- .../python-api/tensorrt_llm.quantization.html | 10 +- .../python-api/tensorrt_llm.runtime.html | 10 +- latest/legacy/reference/memory.html | 14 +- .../multimodal-feature-support-matrix.html | 10 +- latest/legacy/reference/precision.html | 30 +- latest/legacy/reference/support-matrix.html | 11 +- latest/legacy/reference/troubleshooting.html | 10 +- latest/legacy/tensorrt_quickstart.html | 12 +- latest/legacy/torch.html | 10 +- latest/llm-api/index.html | 12 +- latest/llm-api/reference.html | 535 ++++- latest/models/adding-new-model.html | 10 +- latest/models/supported-models.html | 72 +- latest/objects.inv | Bin 182848 -> 183648 bytes latest/overview.html | 14 +- latest/py-modindex.html | 10 +- latest/quick-start-guide.html | 14 +- latest/release-notes.html | 10 +- latest/search.html | 10 +- latest/searchindex.js | 2 +- latest/torch/adding_new_model.html | 10 +- latest/torch/arch_overview.html | 10 +- latest/torch/attention.html | 10 +- .../benchmarking_with_trtllm_bench.html | 10 +- .../auto_deploy/advanced/example_run.html | 10 +- .../advanced/expert_configurations.html | 28 +- .../torch/auto_deploy/advanced/logging.html | 10 +- .../advanced/serving_with_trtllm_serve.html | 10 +- .../torch/auto_deploy/advanced/workflow.html | 10 +- latest/torch/auto_deploy/auto-deploy.html | 10 +- latest/torch/auto_deploy/support_matrix.html | 12 +- latest/torch/features/checkpoint_loading.html | 10 +- latest/torch/features/lora.html | 10 +- latest/torch/features/overlap_scheduler.html | 10 +- latest/torch/features/quantization.html | 16 +- latest/torch/features/sampling.html | 10 +- latest/torch/kv_cache_manager.html | 10 +- latest/torch/scheduler.html | 10 +- 326 files changed, 11644 insertions(+), 1540 deletions(-) create mode 100644 latest/_modules/tensorrt_llm/_torch/async_llm.html create mode 100644 latest/_sources/deployment-guide/config_table.rst.txt create mode 100644 latest/_sources/deployment-guide/note_sections.rst.txt create mode 100644 latest/_sources/examples/curl_responses_client.rst.txt create mode 100644 latest/_sources/examples/openai_responses_client.rst.txt create mode 100644 latest/deployment-guide/config_table.html create mode 100644 latest/deployment-guide/note_sections.html create mode 100644 latest/examples/curl_responses_client.html create mode 100644 latest/examples/openai_responses_client.html diff --git a/latest/.buildinfo b/latest/.buildinfo index 239d52c49f..ea8c7b4edc 100644 --- a/latest/.buildinfo +++ b/latest/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: e432c3509163ef03323e39d8537d99ca +config: 370ff5f62df7a02937391c16812e12e3 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html index e8b4a9df9a..b60b4a4619 100644 --- a/latest/_cpp_gen/executor.html +++ b/latest/_cpp_gen/executor.html @@ -61,7 +61,7 @@ @@ -74,7 +74,7 @@ - + @@ -358,6 +358,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -366,6 +367,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1310,11 +1312,6 @@ using TensorPtr = std::shared_ptr<Tensor>#
    -
    -
    -using SizeType32 = std::int32_t#
    -
    -
    using SizeType64 = std::int64_t#
    @@ -3144,6 +3141,11 @@ virtual CommState const &getCommState() const = 0#
    +
    +
    +virtual bool isRunning() const = 0#
    +
    + @@ -3943,6 +3945,16 @@ namespace executor

    Typedefs

    +
    +
    +typedef tensorrt_llm::runtime::SizeType32 SizeType32#
    +
    + +
    +
    +using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>#
    +
    +
    using RetentionPriority = SizeType32#
    @@ -6921,8 +6933,8 @@

    Public Functions

    -
    -inline KVCacheStoredBlockData( +
    +inline KVCacheStoredBlockData(
    IdType blockHash,
    @@ -6930,9 +6942,10 @@
    std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId,
    SizeType32 cacheLevel,
    SizeType32 priority,
    +
    std::vector<MmKey> mmKeys = {},
    -)#
    +)#
    @@ -6968,6 +6981,12 @@

    The priority of the block.

    +
    +
    +std::vector<MmKey> mmKeys#
    +

    The multimodal keys of the block.

    +
    +
    @@ -12245,7 +12264,6 @@
  • types.h
  • tensorrt_llm::executor::kv_cache::DataContext
  • +
  • SizeType32
  • +
  • MmKey
  • RetentionPriority
  • KVCacheEventData
  • version()
  • @@ -13177,12 +13198,13 @@
  • tensorrt_llm::executor::KVCacheStoredBlockData
  • tensorrt_llm::executor::KVCacheStoredData
  • Dynamo K8s Example
  • @@ -14701,9 +14703,9 @@ one more than decoding draft tokens for prediction from primary head

    diff --git a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py index ed23eb7aab..383ebf8296 100644 --- a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py +++ b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py @@ -985,6 +985,14 @@ class MLA(nn.Module): is_neox=pos_embd_params.is_neox, ) + self.llama_4_scaling = False + if hasattr(config.pretrained_config, 'llama_4_scaling'): + self.llama_4_scaling = True + self.floor_scale = getattr(config.pretrained_config.llama_4_scaling, + 'original_max_position_embeddings', 8192) + self.attn_scale = getattr(config.pretrained_config.llama_4_scaling, + 'beta', 0.1) + if not config.skip_create_weights_in_init: self.create_weights() @@ -1127,6 +1135,18 @@ class MLA(nn.Module): return hidden_states.new_empty([num_tokens, hidden_size], dtype=hidden_states.dtype) + def _attention_scaling(self, q, position_ids): + + def _get_attn_scale(position_ids: torch.Tensor) -> torch.Tensor: + positions = position_ids.view(-1) + floor = torch.floor((positions + 1.0) / self.floor_scale) + attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0 + return attn_scale.unsqueeze(-1) + + attn_scale = _get_attn_scale(position_ids) + q = (q * attn_scale).to(q.dtype) + return q + def forward_impl(self, position_ids: Optional[torch.Tensor], hidden_states: torch.Tensor, @@ -1197,6 +1217,10 @@ class MLA(nn.Module): assert position_ids is not None k_pe_ctx = self.apply_rope(q_ctx, k_pe_ctx, position_ids) + if self.llama_4_scaling: + q_ctx = self._attention_scaling( + q_ctx, position_ids[..., :num_ctx_tokens]) + self.forward_context( q_ctx, compressed_kv_ctx, @@ -1217,6 +1241,10 @@ class MLA(nn.Module): assert position_ids is not None k_pe_gen = self.apply_rope(q_gen, k_pe_gen, position_ids) + if self.llama_4_scaling: + q_gen = self._attention_scaling( + q_gen, position_ids[..., num_ctx_tokens:]) + self.forward_absorption_generation( q_gen, compressed_kv_gen, diff --git a/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py b/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py index 811f11fce5..7574b8f6fd 100644 --- a/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py +++ b/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py @@ -48,7 +48,8 @@ from ..speculative import (SpecMetadata, get_num_extra_kv_tokens, get_spec_metadata, update_spec_config_from_model_config) from ..speculative.drafting_loops import BaseDraftingLoopWrapper -from ..speculative.eagle3 import Eagle3ResourceManager, Eagle3SpecMetadata +from ..speculative.eagle3 import (Eagle3OneModelSpecMetadata, + Eagle3ResourceManager, Eagle3SpecMetadata) from ..speculative.mtp import SampleStateTensorsMTP from ..speculative.utils import SpecDecodingTensor from ..utils import (get_model_extra_attrs, @@ -426,6 +427,7 @@ class PyTorchModelEngine(ModelEngine): mapping=self.mapping, dist=self.dist, kv_cache_manager_key=self.kv_cache_manager_key, + sparse_attention_config=self.sparse_attention_config, ) self.cuda_graph_runner = CUDAGraphRunner(cuda_graph_runner_config) @@ -568,13 +570,12 @@ class PyTorchModelEngine(ModelEngine): # Reset the global cuda graph dummy request to None in warmup. 
self.cuda_graph_runner.padding_dummy_request = None - cp_type = self.mapping.cp_config.get('cp_type', None) - if cp_type is not None: - if cp_type in [CpType.ULYSSES, CpType.STAR]: - logger.info( - "[ModelEngine::warmup] Skipping warmup for cp_type: ", - cp_type.name) - return + if self.mapping.cp_size > 1: + cp_type = self.mapping.cp_config.get("cp_type", None) + logger.info( + f"[ModelEngine::warmup] Skipping warmup for cp_type: {None if cp_type is None else cp_type.name}." + ) + return self._run_torch_compile_warmup(resource_manager) self._run_autotuner_warmup(resource_manager) @@ -625,7 +626,7 @@ class PyTorchModelEngine(ModelEngine): """Runs a forward pass to populate the autotuner cache.""" if not self.llm_args.enable_autotuner: return - + AutoTuner.get().setup_distributed_state(self.mapping, self.dist) logger.info("Running autotuner warmup...") kv_cache_manager = resource_manager.get_resource_manager( self.kv_cache_manager_key) @@ -635,8 +636,7 @@ class PyTorchModelEngine(ModelEngine): self.batch_size * (self.max_seq_len - 1)) cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None) - with self.no_cuda_graph(), autotune(cache_path=cache_path, - rank=self.mapping.rank): + with self.no_cuda_graph(), autotune(cache_path=cache_path): warmup_request = self._create_warmup_request( resource_manager, curr_max_num_tokens, 0) with self._release_batch_context(warmup_request, @@ -704,31 +704,48 @@ class PyTorchModelEngine(ModelEngine): draft_lengths.append(0) draft_lengths = [self.max_total_draft_tokens] + # Create CUDA graphs for short and long sequences separately for sparse attention. + sparse_config = self.sparse_attention_config + if sparse_config is not None and sparse_config.needs_separate_short_long_cuda_graphs( + ): + # For short sequences, use the (seq_len_threshold - max_draft_len - 1) as the maximum sequence length + # to make sure all of the past and current input tokens are within the sequence length threshold. + # For long sequences, use the default maximum sequence length (self.max_seq_len). 
+ max_seq_len = sparse_config.seq_len_threshold - ( + self.max_draft_len + 1) + if max_seq_len < self.max_seq_len: + max_seq_len_list = [self.max_seq_len, max_seq_len] + else: + max_seq_len_list = [self.max_seq_len] + else: + max_seq_len_list = [self.max_seq_len] + for bs in cuda_graph_batch_sizes: if bs > self.batch_size: continue for draft_len in draft_lengths: - warmup_request = self._create_cuda_graph_warmup_request( - resource_manager, bs, draft_len) - with self._release_batch_context(warmup_request, - resource_manager) as batch: - if batch is None: - # No KV cache space, cannot continue capturing graphs - return + for max_seq_len in max_seq_len_list: + warmup_request = self._create_cuda_graph_warmup_request( + resource_manager, bs, draft_len, max_seq_len) + with self._release_batch_context(warmup_request, + resource_manager) as batch: + if batch is None: + # No KV cache space, cannot continue capturing graphs + return - logger.info( - f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}" - ) + logger.info( + f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}, max_seq_len={max_seq_len}" + ) - self.enable_spec_decode = draft_len > 0 or self.is_draft_model - self._update_draft_inference_state_for_warmup( - batch, draft_len > 0, resource_manager) + self.enable_spec_decode = draft_len > 0 or self.is_draft_model + self._update_draft_inference_state_for_warmup( + batch, draft_len > 0, resource_manager) - self.forward(batch, - new_tensors_device=None, - resource_manager=resource_manager) - torch.cuda.synchronize() + self.forward(batch, + new_tensors_device=None, + resource_manager=resource_manager) + torch.cuda.synchronize() def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager): """Captures piecewise CUDA graphs for context/prefill steps via torch.compile.""" @@ -873,8 +890,11 @@ class PyTorchModelEngine(ModelEngine): return result def _create_cuda_graph_warmup_request( - self, resource_manager: ResourceManager, batch_size: int, - draft_len: int) -> Optional[ScheduledRequests]: + self, + resource_manager: ResourceManager, + batch_size: int, + draft_len: int, + max_seq_len: int = None) -> Optional[ScheduledRequests]: """Creates a dummy ScheduledRequests tailored for CUDA graph capture.""" kv_cache_manager = resource_manager.get_resource_manager( self.kv_cache_manager_key) @@ -902,7 +922,8 @@ class PyTorchModelEngine(ModelEngine): available_tokens = kv_cache_manager.get_num_available_tokens(draft_len) # Add one dummy request with the maximum possible sequence length. - token_num = max(1, min(available_tokens, self.max_seq_len - 1)) + max_seq_len = self.max_seq_len if max_seq_len is None else max_seq_len + token_num = max(1, min(available_tokens, max_seq_len - 1)) model_config = self.model.model_config.pretrained_config max_position_embeddings = getattr(model_config, 'max_position_embeddings', None) @@ -1671,12 +1692,12 @@ class PyTorchModelEngine(ModelEngine): # Warmup doesn't have `total_input_len_cp` set because merge_helix_requests is not called. if not self.is_warmup and not request.is_cuda_graph_dummy: position_id = request.total_input_len_cp + request.py_decoding_iter - 1 - # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix. 
- if self.mapping.cp_rank == self.mapping.cp_size - 1: - past_seen_token_num = request.orig_prompt_len + request.py_decoding_iter - 1 + if request.py_helix_is_inactive_rank: + past_seen_token_num = request.seqlen_this_rank_cp else: - # past_seen_token_num doesn't grow on inactive ranks. - past_seen_token_num = request.orig_prompt_len + # Discount the token added to active rank in resource manager as it hasn't + # been previously seen. + past_seen_token_num = request.seqlen_this_rank_cp - 1 position_ids.append(position_id) num_cached_tokens_per_seq.append(past_seen_token_num) @@ -2015,6 +2036,11 @@ class PyTorchModelEngine(ModelEngine): attn_metadata.request_ids = request_ids attn_metadata.prompt_lens = prompt_lengths + if helix_is_inactive_rank is not None and len( + helix_is_inactive_rank) > 0: + helix_is_inactive_rank = torch.tensor(helix_is_inactive_rank, + dtype=torch.bool, + device='cuda') attn_metadata.helix_is_inactive_rank = helix_is_inactive_rank attn_metadata.num_contexts = len(scheduled_requests.context_requests) # Use num_chunked_ctx_requests to record the number of extend context requests, @@ -2089,6 +2115,9 @@ class PyTorchModelEngine(ModelEngine): num_accepted_draft_tokens)] if isinstance(spec_metadata, Eagle3SpecMetadata): spec_metadata.request_accepted_path = request_accepted_path + if isinstance(spec_metadata, Eagle3OneModelSpecMetadata): + spec_metadata.populate_sampling_params_for_one_model( + scheduled_requests.all_requests()) spec_metadata.prepare() inputs['spec_metadata'] = spec_metadata @@ -2643,7 +2672,7 @@ class PyTorchModelEngine(ModelEngine): # attn_metadata now depends on spec_metadata since it determines the shape/content of spec_dec parameter Tensors is_spec_dec_mode = spec_metadata.spec_dec_mode.attention_need_spec_dec_mode( spec_resource_manager, self.is_draft_model, self.attn_backend, - self.model_is_wrapped, spec_metadata.is_spec_dec_tree) + self.model_is_wrapped) attn_metadata.update_spec_dec_param( batch_size=scheduled_requests.batch_size, is_spec_decoding_enabled=is_spec_dec_mode, @@ -2685,6 +2714,7 @@ class PyTorchModelEngine(ModelEngine): spec_metadata=spec_metadata, draft_tokens_cuda=self.draft_tokens_cuda if self.is_spec_decode else None, + new_tensors_device=new_tensors_device, spec_resource_manager=spec_resource_manager, ) can_run_graph = key is not None @@ -2844,11 +2874,17 @@ class PyTorchModelEngine(ModelEngine): # Disable UB for unsupported platforms if not ub.ub_supported(): return False - use_nccl_symmetric = self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC" - ub.initialize_userbuffers_manager( - self.mapping.tp_size, self.mapping.pp_size, self.mapping.cp_size, - self.mapping.rank, self.mapping.gpus_per_node, - hidden_size * self.max_num_tokens * 2, use_nccl_symmetric) + # NCCL_SYMMETRIC strategy no longer requires UserBuffer allocator initialization. + # It uses NCCLWindowAllocator from ncclUtils directly. + if self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC": + # Skip UB initialization for NCCL_SYMMETRIC - it uses NCCLWindowAllocator directly + return False + ub.initialize_userbuffers_manager(self.mapping.tp_size, + self.mapping.pp_size, + self.mapping.cp_size, + self.mapping.rank, + self.mapping.gpus_per_node, + hidden_size * self.max_num_tokens * 2) return True diff --git a/latest/_modules/index.html b/latest/_modules/index.html index d515f21eb2..07b5cbff1b 100644 --- a/latest/_modules/index.html +++ b/latest/_modules/index.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
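# --- Illustrative sketch (not part of the patch) --------------------------------
# The attention.py hunk above adds Llama-4-style position-dependent query scaling to
# MLA: attn_scale = log(floor((pos + 1) / floor_scale) + 1) * beta + 1, using the
# defaults shown in the diff (floor_scale=8192, beta=0.1). A minimal standalone
# version of that computation, assuming only torch:
import torch

def llama4_attn_scale(position_ids: torch.Tensor,
                      floor_scale: float = 8192.0,
                      beta: float = 0.1) -> torch.Tensor:
    positions = position_ids.view(-1)
    floor = torch.floor((positions + 1.0) / floor_scale)
    return (torch.log(floor + 1.0) * beta + 1.0).unsqueeze(-1)

q = torch.randn(4, 16)                       # dummy queries, [num_tokens, dim]
pos = torch.arange(4)                        # token positions
q_scaled = (q * llama4_attn_scale(pos)).to(q.dtype)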
  • @@ -513,7 +515,8 @@

    All modules for which code is available

    -
  • Dynamo K8s Example
  • @@ -725,9 +727,9 @@ diff --git a/latest/_modules/tensorrt_llm/executor/request.html b/latest/_modules/tensorrt_llm/executor/request.html index 2339a1ce92..aeee4c5b8c 100644 --- a/latest/_modules/tensorrt_llm/executor/request.html +++ b/latest/_modules/tensorrt_llm/executor/request.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -770,9 +772,9 @@ diff --git a/latest/_modules/tensorrt_llm/executor/result.html b/latest/_modules/tensorrt_llm/executor/result.html index 547b177ef4..29baf703e1 100644 --- a/latest/_modules/tensorrt_llm/executor/result.html +++ b/latest/_modules/tensorrt_llm/executor/result.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1595,9 +1597,9 @@ diff --git a/latest/_modules/tensorrt_llm/executor/utils.html b/latest/_modules/tensorrt_llm/executor/utils.html index 850f2ec8a4..6e245b4ed2 100644 --- a/latest/_modules/tensorrt_llm/executor/utils.html +++ b/latest/_modules/tensorrt_llm/executor/utils.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -801,9 +803,9 @@ diff --git a/latest/_modules/tensorrt_llm/functional.html b/latest/_modules/tensorrt_llm/functional.html index 2b995214a2..5e982347d4 100644 --- a/latest/_modules/tensorrt_llm/functional.html +++ b/latest/_modules/tensorrt_llm/functional.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -4742,7 +4744,8 @@ TWOSHOT = 5 LOWPRECISION = 6 MNNVL = 7 - NCCL_SYMMETRIC = 8 + NCCL_SYMMETRIC = 8 + SYMM_MEM = 9 # PyTorch symmetric memory with MULTIMEM @@ -4909,7 +4912,10 @@ pfc = trt.PluginFieldCollection(pfc) ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) plug_inputs = [tensor] - if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: + if all_reduce_params.strategy not in { + AllReduceStrategy.NCCL, AllReduceStrategy.UB, + AllReduceStrategy.NCCL_SYMMETRIC + }: plug_inputs.append(workspace) if all_reduce_params.fusion_op != AllReduceFusionOp.NONE: if all_reduce_params.has_bias() == 1: @@ -4984,7 +4990,7 @@ workspace = None if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: if current_all_reduce_helper().workspace is None: - all_reduce_params.strategy = AllReduceStrategy.NCCL + all_reduce_params.strategy = AllReduceStrategy.NCCL_SYMMETRIC else: workspace = current_all_reduce_helper().workspace.trt_tensor if all_reduce_params.strategy == AllReduceStrategy.UB: @@ -8778,9 +8784,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/activation.html b/latest/_modules/tensorrt_llm/layers/activation.html index a0b6c8fad0..e2fdfbc377 100644 --- a/latest/_modules/tensorrt_llm/layers/activation.html +++ b/latest/_modules/tensorrt_llm/layers/activation.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -652,9 +654,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/attention.html b/latest/_modules/tensorrt_llm/layers/attention.html index 67a43d26db..3df595237f 100644 --- a/latest/_modules/tensorrt_llm/layers/attention.html +++ b/latest/_modules/tensorrt_llm/layers/attention.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -3515,9 +3517,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/cast.html b/latest/_modules/tensorrt_llm/layers/cast.html index 030329811a..e66a2962bc 100644 --- a/latest/_modules/tensorrt_llm/layers/cast.html +++ b/latest/_modules/tensorrt_llm/layers/cast.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -659,9 +661,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/conv.html b/latest/_modules/tensorrt_llm/layers/conv.html index 33dbd6af41..a627960266 100644 --- a/latest/_modules/tensorrt_llm/layers/conv.html +++ b/latest/_modules/tensorrt_llm/layers/conv.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -908,9 +910,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/embedding.html b/latest/_modules/tensorrt_llm/layers/embedding.html index 8052999b69..b441d1261b 100644 --- a/latest/_modules/tensorrt_llm/layers/embedding.html +++ b/latest/_modules/tensorrt_llm/layers/embedding.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1375,9 +1377,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/linear.html b/latest/_modules/tensorrt_llm/layers/linear.html index 1f7922f3bd..e4b34ee78d 100644 --- a/latest/_modules/tensorrt_llm/layers/linear.html +++ b/latest/_modules/tensorrt_llm/layers/linear.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1223,9 +1225,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/mlp.html b/latest/_modules/tensorrt_llm/layers/mlp.html index 9c4ef5d37a..1a206c46d6 100644 --- a/latest/_modules/tensorrt_llm/layers/mlp.html +++ b/latest/_modules/tensorrt_llm/layers/mlp.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1249,9 +1251,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/normalization.html b/latest/_modules/tensorrt_llm/layers/normalization.html index 06f0167069..5a2d89b8be 100644 --- a/latest/_modules/tensorrt_llm/layers/normalization.html +++ b/latest/_modules/tensorrt_llm/layers/normalization.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1013,9 +1015,9 @@ diff --git a/latest/_modules/tensorrt_llm/layers/pooling.html b/latest/_modules/tensorrt_llm/layers/pooling.html index 832201632c..d548775ee1 100644 --- a/latest/_modules/tensorrt_llm/layers/pooling.html +++ b/latest/_modules/tensorrt_llm/layers/pooling.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -668,9 +670,9 @@ diff --git a/latest/_modules/tensorrt_llm/llmapi/build_cache.html b/latest/_modules/tensorrt_llm/llmapi/build_cache.html index 197041fdde..ff17e08a8a 100644 --- a/latest/_modules/tensorrt_llm/llmapi/build_cache.html +++ b/latest/_modules/tensorrt_llm/llmapi/build_cache.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -951,9 +953,9 @@ diff --git a/latest/_modules/tensorrt_llm/llmapi/llm.html b/latest/_modules/tensorrt_llm/llmapi/llm.html index de298f7d3f..cac57fbcd3 100644 --- a/latest/_modules/tensorrt_llm/llmapi/llm.html +++ b/latest/_modules/tensorrt_llm/llmapi/llm.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -717,7 +719,7 @@ self.mpi_session = self.args.mpi_session if self.args.parallel_config.is_multi_gpu: - if get_device_count( + if os.getenv("RAY_LOCAL_WORLD_SIZE") is None and get_device_count( ) < self.args.parallel_config.world_size_per_node: raise RuntimeError( f"Only {get_device_count()} GPUs are available, but {self.args.parallel_config.world_size} are required." @@ -753,7 +755,6 @@ self.runtime_context: Optional[_ModelRuntimeContext] = None self.llm_build_stats = LlmBuildStats() - self._build_model() except Exception: @@ -1802,9 +1803,9 @@ diff --git a/latest/_modules/tensorrt_llm/llmapi/llm_args.html b/latest/_modules/tensorrt_llm/llmapi/llm_args.html index 7e808512bb..e871d54b27 100644 --- a/latest/_modules/tensorrt_llm/llmapi/llm_args.html +++ b/latest/_modules/tensorrt_llm/llmapi/llm_args.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -537,6 +539,11 @@ from strenum import StrEnum from transformers import PreTrainedTokenizerBase +try: + from ray.util.placement_group import PlacementGroup +except ImportError: + PlacementGroup = None + from tensorrt_llm.lora_helper import (LoraConfig, get_default_trtllm_modules_to_hf_modules) @@ -707,6 +714,11 @@ """ Configuration for sparse attention. """ + seq_len_threshold: Optional[int] = Field( + default=None, + description= + "The sequence length threshold for separating short and long sequences." + ) @classmethod def from_dict(cls, data: dict): @@ -742,6 +754,15 @@ def get_indices_block_size(self) -> int: return 1 + def needs_separate_short_long_cuda_graphs(self) -> bool: + """ + Determines whether to capture a dedicated CUDA graph for batches consisting entirely of short sequences. + If True, capture distinct graphs for short-only batches and general cases (e.g., long or mixed batches). + If False, capture a single unified CUDA graph for all sequences regardless of length. + The seq_len_threshold parameter defines the cutoff boundary between short and long sequences. + """ + return False +
    [docs] @@ -801,6 +822,11 @@ description="The topk for the indexer.") indexer_max_chunk_size: Optional[int] = Field( default=None, description="The maximum chunk size for the indexer.") + # TODO: enable this by default once the memory usage in attention metadata is optimized + skip_indexer_for_short_seqs: bool = Field( + default=False, + description= + "Whether to skip the MQA and Top-K in the indexer for short sequences.")
    [docs] @@ -813,6 +839,17 @@ [docs] def supports_backend(self, backend: str) -> bool: return backend == "pytorch"
    + + +
    +[docs] + def needs_separate_short_long_cuda_graphs(self) -> bool: + """ + Whether to capture separate CUDA graphs for short and long sequences. + Use seq_len_threshold to determine the threshold for separating short and long sequences. + """ + self.seq_len_threshold = self.index_topk + return self.skip_indexer_for_short_seqs
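# --- Illustrative sketch (not part of the patch) --------------------------------
# The DSA override above opts into separate short/long CUDA graphs by setting
# seq_len_threshold = index_topk and returning skip_indexer_for_short_seqs. The
# model_engine.py hunk earlier in this patch then derives the per-graph
# sequence-length buckets; restated as a small helper (names chosen for illustration):
def cuda_graph_max_seq_lens(sparse_config, max_seq_len: int, max_draft_len: int):
    """Return the max_seq_len values to capture generation CUDA graphs for."""
    if (sparse_config is not None
            and sparse_config.needs_separate_short_long_cuda_graphs()):
        # Keep all past and current tokens of the "short" graph under the threshold.
        short_len = sparse_config.seq_len_threshold - (max_draft_len + 1)
        if short_len < max_seq_len:
            return [max_seq_len, short_len]
    return [max_seq_len]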
    @@ -1180,6 +1217,10 @@ # (N = acceptance_window) drops below this value. acceptance_length_threshold: Optional[float] = None + # Prototype. If true, allows non-greedy sampling when speculation is used. Only applicable + # to 1-model code paths; non-greedy sampling is always enabled on 2-model paths. + allow_advanced_sampling: bool = False + # Validate acceptance controls at field level so they run on model creation @field_validator('acceptance_window') @classmethod @@ -1748,6 +1789,65 @@ +class RayPlacementConfig(StrictBaseModel): + """ + Configuration for Ray GPU workers placement. + This config is only used with AsyncLLM for RL scenarios. + """ + defer_workers_init: bool = Field( + default=False, + description="Defer Ray worker initialization until async setup.") + + placement_groups: Optional[List[Any]] = Field( + default=None, + description="List of Ray placement groups, one per node. " + "Each element must be a ray.util.placement_group.PlacementGroup instance." + ) + + placement_bundle_indices: Optional[List[List[int]]] = Field( + default=None, + description="List of bundle indices for each placement group. " + "Outer list corresponds to placement_groups, inner list contains bundle indices for that group." + ) + + per_worker_gpu_share: Optional[float] = Field( + default=None, + description="GPU fraction per worker for colocation scenarios. " + "Example: 0.1 means 10 actors can share one GPU. Defaults to 1.0 (one actor per GPU)." + ) + + @model_validator(mode='after') + def validate_ray_placement(self) -> 'RayPlacementConfig': + has_pgs = self.placement_groups is not None + has_indices = self.placement_bundle_indices is not None + + if has_pgs != has_indices: + raise ValueError( + "placement_groups and placement_bundle_indices must be provided together" + ) + + if has_pgs: + if len(self.placement_groups) != len(self.placement_bundle_indices): + raise ValueError( + f"placement_groups length ({len(self.placement_groups)}) must equal " + f"placement_bundle_indices length ({len(self.placement_bundle_indices)})" + ) + if PlacementGroup is not None: + for i, pg in enumerate(self.placement_groups): + if not isinstance(pg, PlacementGroup): + raise TypeError( + f"placement_groups[{i}] must be a Ray PlacementGroup, " + f"got {type(pg).__name__}") + + if self.per_worker_gpu_share is not None: + if not (0 < self.per_worker_gpu_share <= 1.0): + raise ValueError( + f"per_worker_gpu_share must be between 0 and 1.0, " + f"got {self.per_worker_gpu_share}") + + return self + + class PybindMirror(ABC): ''' A class containing the utilities for mirroring Python classes to pybinding classes. @@ -2675,9 +2775,17 @@ env_overrides: Optional[Dict[str, str]] = Field( default=None, description= - "[EXPERIMENTAL] Environment variable overrides. NOTE: import-time-cached env vars in the code won’t update unless the code fetches them from os.environ on demand.", + "[EXPERIMENTAL] Environment variable overrides. 
NOTE: import-time-cached env vars in the code won't update unless the code fetches them from os.environ on demand.", status="prototype") + @field_validator('env_overrides', mode='before') + @classmethod + def coerce_env_overrides_to_str(cls, v): + """Coerce env_overrides values to strings for os.environ compatibility.""" + if v is None: + return v + return {str(k): str(val) for k, val in v.items()} + _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None) _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None) _speculative_model: Optional[str] = PrivateAttr(default=None) @@ -2745,6 +2853,8 @@ @field_validator("gpus_per_node", mode='before') @classmethod def validate_gpus_per_node(cls, v, info): + if os.getenv("RAY_LOCAL_WORLD_SIZE") is not None: + return info.data.get("tensor_parallel_size") if v is None: logger.warning( f"Using default gpus_per_node: {torch.cuda.device_count()}") @@ -3366,6 +3476,15 @@ "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler unless BeamSearch is requested.", status="beta") + sampler_force_async_worker: bool = Field( + default=False, + description="Force usage of the async worker in the sampler for D2H " + "copies, even if confidential compute is not active. Normally, the " + "async worker should only be used when confidential compute is active. " + "This argument is provided to enable it for testing purposes, " + "irrespective of confidential compute state.", + status="prototype") + enable_iter_perf_stats: bool = Field( default=False, description="Enable iteration performance statistics.", @@ -3498,6 +3617,13 @@ "Allows users to extend the functions of the RayGPUWorker class.", status="prototype") + ray_placement_config: Optional[RayPlacementConfig] = Field( + default=None, + description= + "Placement config for RayGPUWorker. Only used with AsyncLLM and orchestrator_type='ray'.", + exclude=True, + status="prototype") + enable_sleep: bool = Field( default=False, description= @@ -3763,6 +3889,27 @@ return self +
    +[docs] + @model_validator(mode='after') + def validate_helix_tokens_per_block(self) -> 'TorchLlmArgs': + """Validate that cp_config.tokens_per_block matches kv_cache_config.tokens_per_block when HELIX parallelism is active.""" + if self.context_parallel_size == 1 or self.cp_config is None or not self.cp_config: + return self + + cp_type = self.cp_config.get('cp_type', None) + if cp_type is not None and str(cp_type).upper() == 'HELIX': + cp_tokens_per_block = self.cp_config.get('tokens_per_block', None) + if cp_tokens_per_block is not None: + kv_tokens_per_block = self.kv_cache_config.tokens_per_block + assert cp_tokens_per_block == kv_tokens_per_block, ( + f"When HELIX parallelism is active, cp_config.tokens_per_block ({cp_tokens_per_block}) " + f"must match kv_cache_config.tokens_per_block ({kv_tokens_per_block})." + ) + + return self
    + +
    [docs] def warn_on_unstable_feature_usage(self) -> 'TorchLlmArgs': @@ -3855,6 +4002,17 @@ return self
    +
    +[docs] + @model_validator(mode='after') + def validate_ray_placement_config(self) -> 'TorchLlmArgs': + if self.ray_placement_config is not None and self.orchestrator_type != "ray": + raise ValueError( + "ray_placement_config is only supported with orchestrator_type='ray'" + ) + return self
    + +
    [docs] def get_executor_config( @@ -4087,9 +4245,9 @@ diff --git a/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html b/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html index 75c2a35120..8193bf85e3 100644 --- a/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html +++ b/latest/_modules/tensorrt_llm/llmapi/mm_encoder.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -778,9 +780,9 @@ diff --git a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html index 1a5bbbee79..57496f23c0 100644 --- a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html +++ b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1248,9 +1250,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/baichuan/model.html b/latest/_modules/tensorrt_llm/models/baichuan/model.html index 1feedcf89d..17b9618c91 100644 --- a/latest/_modules/tensorrt_llm/models/baichuan/model.html +++ b/latest/_modules/tensorrt_llm/models/baichuan/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -886,9 +888,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/bert/model.html b/latest/_modules/tensorrt_llm/models/bert/model.html index ff8bca7065..9c5e8ca7ef 100644 --- a/latest/_modules/tensorrt_llm/models/bert/model.html +++ b/latest/_modules/tensorrt_llm/models/bert/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1190,9 +1192,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/bloom/model.html b/latest/_modules/tensorrt_llm/models/bloom/model.html index bdfef77765..52a5820861 100644 --- a/latest/_modules/tensorrt_llm/models/bloom/model.html +++ b/latest/_modules/tensorrt_llm/models/bloom/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -798,9 +800,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/chatglm/config.html b/latest/_modules/tensorrt_llm/models/chatglm/config.html index 24e4fa539e..fa1b3c6424 100644 --- a/latest/_modules/tensorrt_llm/models/chatglm/config.html +++ b/latest/_modules/tensorrt_llm/models/chatglm/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -815,9 +817,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/chatglm/model.html b/latest/_modules/tensorrt_llm/models/chatglm/model.html index 3ad79c2e00..ac8294f98c 100644 --- a/latest/_modules/tensorrt_llm/models/chatglm/model.html +++ b/latest/_modules/tensorrt_llm/models/chatglm/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1014,9 +1016,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/clip/model.html b/latest/_modules/tensorrt_llm/models/clip/model.html index 3fa05c6d7a..054e22a83e 100644 --- a/latest/_modules/tensorrt_llm/models/clip/model.html +++ b/latest/_modules/tensorrt_llm/models/clip/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -843,9 +845,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/config.html b/latest/_modules/tensorrt_llm/models/cogvlm/config.html index 964bf3dc22..75a7de91e7 100644 --- a/latest/_modules/tensorrt_llm/models/cogvlm/config.html +++ b/latest/_modules/tensorrt_llm/models/cogvlm/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -674,9 +676,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/model.html b/latest/_modules/tensorrt_llm/models/cogvlm/model.html index 1a095e6404..4d2557aec9 100644 --- a/latest/_modules/tensorrt_llm/models/cogvlm/model.html +++ b/latest/_modules/tensorrt_llm/models/cogvlm/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -927,9 +929,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/commandr/model.html b/latest/_modules/tensorrt_llm/models/commandr/model.html index c005f54ad6..cb0c864cd7 100644 --- a/latest/_modules/tensorrt_llm/models/commandr/model.html +++ b/latest/_modules/tensorrt_llm/models/commandr/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -825,9 +827,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/dbrx/config.html b/latest/_modules/tensorrt_llm/models/dbrx/config.html index 33668b804a..9a7e917fea 100644 --- a/latest/_modules/tensorrt_llm/models/dbrx/config.html +++ b/latest/_modules/tensorrt_llm/models/dbrx/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -689,9 +691,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/dbrx/model.html b/latest/_modules/tensorrt_llm/models/dbrx/model.html index 1f7d28a8c7..c058b28af2 100644 --- a/latest/_modules/tensorrt_llm/models/dbrx/model.html +++ b/latest/_modules/tensorrt_llm/models/dbrx/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -815,9 +817,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html index 58859057d6..4ef0de9bfe 100644 --- a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html +++ b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -909,9 +911,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html index b8f43dd0ac..14b01a9e89 100644 --- a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html +++ b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -991,9 +993,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/dit/model.html b/latest/_modules/tensorrt_llm/models/dit/model.html index 79248a4f0f..3b1edabc97 100644 --- a/latest/_modules/tensorrt_llm/models/dit/model.html +++ b/latest/_modules/tensorrt_llm/models/dit/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1027,9 +1029,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/eagle/model.html b/latest/_modules/tensorrt_llm/models/eagle/model.html index 746b54b9c6..72268516ce 100644 --- a/latest/_modules/tensorrt_llm/models/eagle/model.html +++ b/latest/_modules/tensorrt_llm/models/eagle/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1963,9 +1965,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/enc_dec/model.html b/latest/_modules/tensorrt_llm/models/enc_dec/model.html index 29b872204a..5670bcb067 100644 --- a/latest/_modules/tensorrt_llm/models/enc_dec/model.html +++ b/latest/_modules/tensorrt_llm/models/enc_dec/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -2870,9 +2872,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/falcon/config.html b/latest/_modules/tensorrt_llm/models/falcon/config.html index 1544f9b733..a517fbaa87 100644 --- a/latest/_modules/tensorrt_llm/models/falcon/config.html +++ b/latest/_modules/tensorrt_llm/models/falcon/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -750,9 +752,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/falcon/model.html b/latest/_modules/tensorrt_llm/models/falcon/model.html index 2cb24e48cd..77a7150981 100644 --- a/latest/_modules/tensorrt_llm/models/falcon/model.html +++ b/latest/_modules/tensorrt_llm/models/falcon/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -912,9 +914,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gemma/config.html b/latest/_modules/tensorrt_llm/models/gemma/config.html index d887eb7302..7e7f0420b7 100644 --- a/latest/_modules/tensorrt_llm/models/gemma/config.html +++ b/latest/_modules/tensorrt_llm/models/gemma/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -840,9 +842,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gemma/model.html b/latest/_modules/tensorrt_llm/models/gemma/model.html index 38fa4c0da5..c0bcd22677 100644 --- a/latest/_modules/tensorrt_llm/models/gemma/model.html +++ b/latest/_modules/tensorrt_llm/models/gemma/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1035,9 +1037,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gpt/config.html b/latest/_modules/tensorrt_llm/models/gpt/config.html index b4564e9fea..408afb19c0 100644 --- a/latest/_modules/tensorrt_llm/models/gpt/config.html +++ b/latest/_modules/tensorrt_llm/models/gpt/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -959,9 +961,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gpt/model.html b/latest/_modules/tensorrt_llm/models/gpt/model.html index c7ccc4dcd4..80d5cb7e7b 100644 --- a/latest/_modules/tensorrt_llm/models/gpt/model.html +++ b/latest/_modules/tensorrt_llm/models/gpt/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1062,9 +1064,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gptj/config.html b/latest/_modules/tensorrt_llm/models/gptj/config.html index 3f391136e7..af1260074d 100644 --- a/latest/_modules/tensorrt_llm/models/gptj/config.html +++ b/latest/_modules/tensorrt_llm/models/gptj/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -688,9 +690,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gptj/model.html b/latest/_modules/tensorrt_llm/models/gptj/model.html index 1c37471537..23c9da2bfb 100644 --- a/latest/_modules/tensorrt_llm/models/gptj/model.html +++ b/latest/_modules/tensorrt_llm/models/gptj/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -838,9 +840,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/gptneox/model.html b/latest/_modules/tensorrt_llm/models/gptneox/model.html index 7f514b91c3..f10a22212d 100644 --- a/latest/_modules/tensorrt_llm/models/gptneox/model.html +++ b/latest/_modules/tensorrt_llm/models/gptneox/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -780,9 +782,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/llama/config.html b/latest/_modules/tensorrt_llm/models/llama/config.html index fdf8b75fd5..35794b9a22 100644 --- a/latest/_modules/tensorrt_llm/models/llama/config.html +++ b/latest/_modules/tensorrt_llm/models/llama/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -914,9 +916,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/llama/model.html b/latest/_modules/tensorrt_llm/models/llama/model.html index d09be90d7a..4960edfb55 100644 --- a/latest/_modules/tensorrt_llm/models/llama/model.html +++ b/latest/_modules/tensorrt_llm/models/llama/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1262,9 +1264,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/mamba/model.html b/latest/_modules/tensorrt_llm/models/mamba/model.html index d11931dd33..2e19b21a09 100644 --- a/latest/_modules/tensorrt_llm/models/mamba/model.html +++ b/latest/_modules/tensorrt_llm/models/mamba/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1107,9 +1109,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/medusa/config.html b/latest/_modules/tensorrt_llm/models/medusa/config.html index 5fcfb61ebc..8169b9e474 100644 --- a/latest/_modules/tensorrt_llm/models/medusa/config.html +++ b/latest/_modules/tensorrt_llm/models/medusa/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -747,9 +749,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/medusa/model.html b/latest/_modules/tensorrt_llm/models/medusa/model.html index e903cea5f6..160c44d25d 100644 --- a/latest/_modules/tensorrt_llm/models/medusa/model.html +++ b/latest/_modules/tensorrt_llm/models/medusa/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -897,9 +899,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/mllama/model.html b/latest/_modules/tensorrt_llm/models/mllama/model.html index 6be138b113..02d061302a 100644 --- a/latest/_modules/tensorrt_llm/models/mllama/model.html +++ b/latest/_modules/tensorrt_llm/models/mllama/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -2208,9 +2210,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html index b9d46b13f0..6ec32c5df2 100644 --- a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html +++ b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1274,9 +1276,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/modeling_utils.html b/latest/_modules/tensorrt_llm/models/modeling_utils.html index bd2432a93e..3812b60a07 100644 --- a/latest/_modules/tensorrt_llm/models/modeling_utils.html +++ b/latest/_modules/tensorrt_llm/models/modeling_utils.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -2683,9 +2685,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/mpt/model.html b/latest/_modules/tensorrt_llm/models/mpt/model.html index 8813fa46cb..459ef5830e 100644 --- a/latest/_modules/tensorrt_llm/models/mpt/model.html +++ b/latest/_modules/tensorrt_llm/models/mpt/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -812,9 +814,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html index ffda4025f2..155f15b151 100644 --- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html +++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -746,9 +748,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html index 1c19643337..abb527e04b 100644 --- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html +++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -814,9 +816,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/opt/model.html b/latest/_modules/tensorrt_llm/models/opt/model.html index e10798ca21..4e3f9e15a7 100644 --- a/latest/_modules/tensorrt_llm/models/opt/model.html +++ b/latest/_modules/tensorrt_llm/models/opt/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -817,9 +819,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/phi/model.html b/latest/_modules/tensorrt_llm/models/phi/model.html index deecc0188d..3ca7749044 100644 --- a/latest/_modules/tensorrt_llm/models/phi/model.html +++ b/latest/_modules/tensorrt_llm/models/phi/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -859,9 +861,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/phi3/model.html b/latest/_modules/tensorrt_llm/models/phi3/model.html index ce940e2d70..05e965e296 100644 --- a/latest/_modules/tensorrt_llm/models/phi3/model.html +++ b/latest/_modules/tensorrt_llm/models/phi3/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -955,9 +957,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html index cbe0030d9c..b71d6ca879 100644 --- a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html +++ b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1260,9 +1262,9 @@ diff --git a/latest/_modules/tensorrt_llm/models/redrafter/model.html b/latest/_modules/tensorrt_llm/models/redrafter/model.html index 6abe0d27cd..05ecd0e886 100644 --- a/latest/_modules/tensorrt_llm/models/redrafter/model.html +++ b/latest/_modules/tensorrt_llm/models/redrafter/model.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -947,9 +949,9 @@ diff --git a/latest/_modules/tensorrt_llm/plugin/plugin.html b/latest/_modules/tensorrt_llm/plugin/plugin.html index 25b371b9d8..6c5c97516b 100644 --- a/latest/_modules/tensorrt_llm/plugin/plugin.html +++ b/latest/_modules/tensorrt_llm/plugin/plugin.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1435,9 +1437,9 @@ diff --git a/latest/_modules/tensorrt_llm/quantization/mode.html b/latest/_modules/tensorrt_llm/quantization/mode.html index 35fc7812a6..8937e5cda1 100644 --- a/latest/_modules/tensorrt_llm/quantization/mode.html +++ b/latest/_modules/tensorrt_llm/quantization/mode.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1109,9 +1111,9 @@ diff --git a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html index cce8bf592a..63d236e56a 100644 --- a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html +++ b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1903,9 +1905,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html index 949b58db8e..81d68fc822 100644 --- a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html +++ b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1174,9 +1176,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/generation.html b/latest/_modules/tensorrt_llm/runtime/generation.html index 3d57374b31..d2e389435a 100644 --- a/latest/_modules/tensorrt_llm/runtime/generation.html +++ b/latest/_modules/tensorrt_llm/runtime/generation.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -5514,9 +5516,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html index 93d0048c58..3f91525255 100644 --- a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html +++ b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1118,9 +1120,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner.html b/latest/_modules/tensorrt_llm/runtime/model_runner.html index 77830c02ea..23889296f5 100644 --- a/latest/_modules/tensorrt_llm/runtime/model_runner.html +++ b/latest/_modules/tensorrt_llm/runtime/model_runner.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -991,6 +993,7 @@ prompt_table, torch.Tensor), "Prompt table should be str or torch.Tensor" prompt_table_data = prompt_table.to(dtype=self.dtype) + torch.cuda.current_stream().synchronize() return prompt_table_data @@ -1637,9 +1640,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html index 074e6b3161..ac227fda67 100644 --- a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html +++ b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1850,9 +1852,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html index 37ef76d322..e53e09fcc9 100644 --- a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html +++ b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -3432,9 +3434,9 @@ diff --git a/latest/_modules/tensorrt_llm/runtime/session.html b/latest/_modules/tensorrt_llm/runtime/session.html index aa650ba6b9..da040db4d2 100644 --- a/latest/_modules/tensorrt_llm/runtime/session.html +++ b/latest/_modules/tensorrt_llm/runtime/session.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -978,9 +980,9 @@ diff --git a/latest/_modules/tensorrt_llm/sampling_params.html b/latest/_modules/tensorrt_llm/sampling_params.html index 12e5d655ff..75447e5237 100644 --- a/latest/_modules/tensorrt_llm/sampling_params.html +++ b/latest/_modules/tensorrt_llm/sampling_params.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -361,6 +362,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -862,9 +864,13 @@ [docs] @staticmethod def params_imply_greedy_decoding( - *, temperature: Optional[float], top_p: Optional[float], top_k: Optional[int] + *, + temperature: Optional[float], + top_p: Optional[float], + top_k: Optional[int], + use_beam_search: bool | None, ): - return ( + return (not use_beam_search) and ( (temperature is None and top_p is None and top_k is None) or top_k == 1 or top_p == 0.0 @@ -874,10 +880,11 @@ @property def _greedy_decoding(self) -> bool: - return not self.use_beam_search and self.params_imply_greedy_decoding( + return self.params_imply_greedy_decoding( temperature=self.temperature, top_p=self.top_p, top_k=self.top_k, + use_beam_search=self.use_beam_search, ) @property @@ -1192,9 +1199,9 @@ diff --git a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt index da72ee5464..ad0e9975a1 100644 --- a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt +++ b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt @@ -30,7 +30,7 @@ In this blog, we share the configurations and procedures about how to reproduce - [Expected Result Format](#expected-result-format-3) - [Exploring more ISL/OSL combinations](#exploring-more-islosl-combinations) - [WIP: Enable more features by default](#wip-enable-more-features-by-default) - - [Not supported: MLA chunked context support on Hopper](#not-supported-mla-chunked-context-support-on-hopper) + - [MLA chunked context](#mla-chunked-context) - [Out of memory issues](#out-of-memory-issues) @@ -69,8 +69,11 @@ For NVIDIA Hopper GPUs, it's recommended to use the FP8 version of the DeepSeek YOUR_MODEL_PATH= cd $YOUR_MODEL_PATH -## Download FP4 model for Blackwell GPUs -git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4 +## Download NVFP4 model for Blackwell GPUs +git clone https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2 + +## Or the 0528 version +git clone https://huggingface.co/nvidia/DeepSeek-R1-0528-NVFP4-v2 ## Download FP8 model for Hopper GPUs ## FP8 model also works for Blackwell, but FP4 has the best performance on Blackwell. @@ -248,13 +251,13 @@ To do the benchmark, run the following command: ```bash # generate synthetic dataset -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \ - --stdout \ - --tokenizer nvidia/DeepSeek-R1-FP4 \ +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \ + prepare-dataset \ + --output dataset.txt \ token-norm-dist \ --input-mean 1024 --output-mean 2048 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 49152 > dataset.txt + --num-requests 49152 YOUR_DATA_PATH=./dataset.txt @@ -350,13 +353,14 @@ To do the benchmark, run the following command: ```bash # generate synthetic dataset -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \ - --stdout \ - --tokenizer deepseek-ai/DeepSeek-R1 \ +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \ + prepare-dataset \ + --output dataset.txt \ token-norm-dist \ --input-mean 1024 --output-mean 2048 \ --input-stdev 0 --output-stdev 0 \ - --num-requests 5120 > dataset.txt + --num-requests 5120 + YOUR_DATA_PATH=./dataset.txt cat >./extra-llm-api-config.yml<`_ for examples in the following sections. +After you start the server, you can send inference requests through completions API, Chat API and Responses API, which are compatible with corresponding OpenAI APIs. We use `TinyLlama-1.1B-Chat-v1.0 `_ for examples in the following sections. 
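For orientation before the per-API sections that follow, here is a minimal sketch of a Chat API request against a locally running `trtllm-serve` instance. It assumes the server is reachable at the default `http://localhost:8000/v1` endpoint and that the `openai` Python package is installed; the bundled example scripts referenced below cover the same flow in more detail:

```python
from openai import OpenAI

# Assumes trtllm-serve is already running locally on its default port.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

response = client.chat.completions.create(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    messages=[{"role": "user", "content": "Where is New York?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```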
Chat API ~~~~~~~~ @@ -66,6 +66,24 @@ Another example uses ``curl``: :language: bash :linenos: +Responses API +~~~~~~~~~~~~~~~ + +You can query Responses API with any http clients, a typical example is OpenAI Python client: + +.. literalinclude:: ../../../../examples/serve/openai_responses_client.py + :language: python + :linenos: + +Another example uses ``curl``: + +.. literalinclude:: ../../../../examples/serve/curl_responses_client.sh + :language: bash + :linenos: + + +More openai compatible examples can be found in the `compatibility examples `_ directory. + Multimodal Serving ~~~~~~~~~~~~~~~~~~ diff --git a/latest/_sources/deployment-guide/config_table.rst.txt b/latest/_sources/deployment-guide/config_table.rst.txt new file mode 100644 index 0000000000..d28fed25a8 --- /dev/null +++ b/latest/_sources/deployment-guide/config_table.rst.txt @@ -0,0 +1,1074 @@ +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. start-deepseek-ai/DeepSeek-R1-0528 + +.. _deepseek-ai/DeepSeek-R1-0528: + +`DeepSeek-R1 `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + +.. end-deepseek-ai/DeepSeek-R1-0528 + +.. start-nvidia/DeepSeek-R1-0528-FP4-v2 + +.. _nvidia/DeepSeek-R1-0528-FP4-v2: + +`DeepSeek-R1 (NVFP4) `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp4_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` + * - 
8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 128 + - `8k1k_tp4_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 128 + - `8k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + +.. end-nvidia/DeepSeek-R1-0528-FP4-v2 + +.. start-openai/gpt-oss-120b + +.. _openai/gpt-oss-120b: + +`gpt-oss-120b `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :width: 100% + :header-rows: 1 + :widths: 12 15 15 13 20 25 + + * - GPU + - Performance Profile + - ISL / OSL + - Concurrency + - Config + - Command + * - B200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - B200_NVL + - High 
Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - B200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 
1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - B200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - B200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 4xB200_NVL + - Low 
Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - B200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - B200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + * - B200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + * - B200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + * - 2xB200_NVL + - High Throughput 
+ - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - 
`1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - 
`1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + 
- 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + * - 8xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - 
`8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + +.. end-openai/gpt-oss-120b diff --git a/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt index 55deeb94fe..a887ec24b9 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md.txt @@ -47,7 +47,7 @@ docker run --rm -it \ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash ``` @@ -66,7 +66,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -74,7 +74,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/deepseek-r1-throughput.yaml +```{literalinclude} ../../../examples/configs/curated/deepseek-r1-throughput.yaml --- language: shell prepend: | @@ -90,7 +90,7 @@ To use the `DeepGEMM` MOE backend on B200/GB200, use this config instead: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -98,7 +98,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/deepseek-r1-deepgemm.yaml +```{literalinclude} ../../../examples/configs/curated/deepseek-r1-deepgemm.yaml --- language: shell prepend: | @@ -154,7 +154,7 @@ These options provide control over TensorRT LLM's behavior and are set within th #### `trust_remote_code` - **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. +* **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. 
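As a minimal illustration, options like this are plain top-level keys in the YAML file passed via `--extra_llm_api_options`. The sketch below uses a hypothetical file name and sets only this one flag; for real deployments, prefer the curated configs referenced above, which also carry the performance-related settings.

```shell
# Minimal sketch: write a hand-rolled extra-options file (hypothetical name) and point trtllm-serve at it.
cat > my_extra_llm_api_options.yaml <<'EOF'
trust_remote_code: true
EOF
trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options my_extra_llm_api_options.yaml
```

The other options described in this section are added to the same file in the same way.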
#### `kv_cache_config` @@ -429,3 +429,23 @@ $$ $$ \text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}} $$ + +## Preconfigured Recipes + +The following tables list recommended configurations from the comprehensive database for different performance profiles. + +```{eval-rst} +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. include:: config_table.rst + :start-after: .. start-deepseek-ai/DeepSeek-R1-0528 + :end-before: .. end-deepseek-ai/DeepSeek-R1-0528 +``` + +```{eval-rst} +.. include:: config_table.rst + :start-after: .. start-nvidia/DeepSeek-R1-0528-FP4-v2 + :end-before: .. end-nvidia/DeepSeek-R1-0528-FP4-v2 +``` diff --git a/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt index ae34c5b3ce..cc30f55e98 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md.txt @@ -43,7 +43,7 @@ docker run --rm -it \ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash ``` @@ -64,7 +64,7 @@ For low-latency use cases: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -72,7 +72,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/gpt-oss-120b-latency.yaml +```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-latency.yaml --- language: shell prepend: | @@ -88,7 +88,7 @@ For max-throughput use cases: ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -96,7 +96,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/gpt-oss-120b-throughput.yaml +```{literalinclude} ../../../examples/configs/curated/gpt-oss-120b-throughput.yaml --- language: shell prepend: | @@ -377,3 +377,17 @@ $$ $$ \text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}} $$ + +## Preconfigured Recipes + +The following table lists recommended configurations from the comprehensive database for different performance profiles. + +```{eval-rst} +.. include:: note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. include:: config_table.rst + :start-after: .. start-openai/gpt-oss-120b + :end-before: .. 
end-openai/gpt-oss-120b +``` diff --git a/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt index d8ec17daff..391a72091d 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md.txt @@ -306,3 +306,18 @@ Run `bench.sh` to begin a serving benchmark. ```shell ./bench.sh ``` + +## Troubleshooting + +Since Kimi K2 Thinking has a larger weight size than other models, you may encounter host OOM (out-of-memory) issues such as the following: + +```log +Loading weights: 100%|█████████████████████| 1408/1408 [03:43<00:00, 6.30it/s] + 0: [12/04/2025-18:38:28] [TRT-LLM] [RANK 0] [I] moe_load_balancer finalizing model... + 1: [nvl72136-T14:452151:0:452151] Caught signal 7 (Bus error: nonexistent physical address) + 1: ==== backtrace (tid: 452151) ==== + 1: 0 /usr/local/ucx//lib/libucs.so.0(ucs_handle_error+0x2cc) [0xffff9638274c] + 1: 1 /usr/local/ucx//lib/libucs.so.0(+0x328fc) [0xffff963828fc] + 1: 2 /usr/local/ucx//lib/libucs.so.0(+0x32c78) [0xffff96382c78] +``` +This can be addressed by mounting `tmpfs:/dev/shm:size=640G` when launching the Docker container, which increases the shared memory (shm) size available to the container. diff --git a/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt index d227b2f440..b45b7d2ffa 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md.txt @@ -39,7 +39,7 @@ docker run --rm -it \ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash ``` @@ -58,7 +58,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.
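Alternatively, if the machine has network access, you can fetch the curated config directly from GitHub rather than recreating it by hand. This is a sketch; replace `main` with the release tag that matches your container version if needed.

```shell
# Sketch: download the curated config instead of copying it from the dropdown.
curl -fsSL -o llama-3.3-70b.yaml \
  https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/main/examples/configs/curated/llama-3.3-70b.yaml
EXTRA_LLM_API_FILE=$(pwd)/llama-3.3-70b.yaml
```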
@@ -66,7 +66,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/llama-3.3-70b.yaml +```{literalinclude} ../../../examples/configs/curated/llama-3.3-70b.yaml --- language: shell prepend: | diff --git a/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt index 509a5cf00f..3e70209b21 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md.txt @@ -38,7 +38,7 @@ docker run --rm -it \ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash ``` @@ -57,7 +57,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -65,7 +65,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/llama-4-scout.yaml +```{literalinclude} ../../../examples/configs/curated/llama-4-scout.yaml --- language: shell prepend: | diff --git a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt index 246fc74a56..46bf724b71 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md.txt @@ -35,7 +35,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3-next.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. 
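If you do hand-create the file, a quick syntax check before launching the server can catch copy-paste mistakes early. The sketch below assumes PyYAML is importable in your environment and validates whatever `EXTRA_LLM_API_FILE` points to.

```shell
# Sketch: confirm the config parses as YAML before passing it to trtllm-serve.
python3 -c "import sys, yaml; yaml.safe_load(open(sys.argv[1])); print('valid YAML')" "${EXTRA_LLM_API_FILE}"
```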
@@ -43,7 +43,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/qwen3-next.yaml +```{literalinclude} ../../../examples/configs/curated/qwen3-next.yaml --- language: shell prepend: | diff --git a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt index 190740ebd8..894c6a1e63 100644 --- a/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt +++ b/latest/_sources/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md.txt @@ -40,7 +40,7 @@ We maintain YAML configuration files with recommended performance settings in th ```shell TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml ``` Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below. @@ -48,7 +48,7 @@ Note: if you don't have access to the source code locally, you can manually crea ````{admonition} Show code :class: dropdown -```{literalinclude} ../../../examples/configs/qwen3.yaml +```{literalinclude} ../../../examples/configs/curated/qwen3.yaml --- language: shell prepend: | diff --git a/latest/_sources/deployment-guide/index.rst.txt b/latest/_sources/deployment-guide/index.rst.txt index ed7fd9c536..644a9d9ae9 100644 --- a/latest/_sources/deployment-guide/index.rst.txt +++ b/latest/_sources/deployment-guide/index.rst.txt @@ -6,15 +6,20 @@ Quick Start for Popular Models The table below contains ``trtllm-serve`` commands that can be used to easily deploy popular models including DeepSeek-R1, gpt-oss, Llama 4, Qwen3, and more. -We maintain LLM API configuration files for these models containing recommended performance settings in the `examples/configs `_ directory. The TensorRT LLM Docker container makes the config files available at ``/app/tensorrt_llm/examples/configs``, but you can customize this as needed: +We maintain LLM API configuration files for these models containing recommended performance settings in two locations: + +* **Curated Examples**: `examples/configs/curated `_ - Hand-picked configurations for common scenarios. +* **Comprehensive Database**: `examples/configs/database `_ - A more comprehensive set of known-good configurations for various GPUs and traffic patterns. + +The TensorRT LLM Docker container makes these config files available at ``/app/tensorrt_llm/examples/configs/curated`` and ``/app/tensorrt_llm/examples/configs/database`` respectively. You can reference them as needed: .. code-block:: bash export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. note:: - - The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, you may benefit from additional tuning. In the future, we plan to provide more configs for a wider range of traffic patterns. +.. include:: note_sections.rst + :start-after: .. start-note-quick-start-isl-osl + :end-before: .. end-note-quick-start-isl-osl This table is designed to provide a straightforward starting point; for detailed model-specific deployment guides, check out the guides below. 
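Before picking a row from the table below, it can help to browse what actually ships in the container. A quick sketch, using the paths documented above (the database is organized by organization, model, and GPU, with an ``<isl><osl>_tp<N>_conc<M>.yaml`` naming scheme):

.. code-block:: bash

   # Sketch: list the configuration files bundled with the release container.
   export TRTLLM_DIR="/app/tensorrt_llm"
   ls ${TRTLLM_DIR}/examples/configs/curated/
   ls ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/   # e.g. 1k1k_tp1_conc4.yaml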
@@ -30,53 +35,53 @@ This table is designed to provide a straightforward starting point; for detailed * - `DeepSeek-R1 `_ - H100, H200 - Max Throughput - - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml`` + - `deepseek-r1-throughput.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml`` + - `deepseek-r1-deepgemm.yaml `_ + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml`` + - `deepseek-r1-throughput.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/deepseek-r1-latency.yaml`` + - `deepseek-r1-latency.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml`` + - `gpt-oss-120b-throughput.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml`` + - `gpt-oss-120b-latency.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3-next.yaml`` + - `qwen3-next.yaml `_ + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. 
`Qwen3-30B-A3B `_) - Any - Max Throughput - - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - `qwen3.yaml `_ + - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml`` + - `llama-3.3-70b.yaml `_ + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml`` + - `llama-4-scout.yaml `_ + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides --------------------------------- @@ -94,3 +99,10 @@ The deployment guides below provide more detailed instructions for serving speci deployment-guide-for-qwen3-on-trtllm.md deployment-guide-for-qwen3-next-on-trtllm.md deployment-guide-for-kimi-k2-thinking-on-trtllm.md + +Comprehensive Configuration Database +------------------------------------ + +The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings. + +.. include:: config_table.rst diff --git a/latest/_sources/deployment-guide/note_sections.rst.txt b/latest/_sources/deployment-guide/note_sections.rst.txt new file mode 100644 index 0000000000..4cd0d1c41d --- /dev/null +++ b/latest/_sources/deployment-guide/note_sections.rst.txt @@ -0,0 +1,36 @@ +.. + Reusable note sections for deployment guides. + Include specific notes using: + + .. include:: note_sections.rst + :start-after: .. start-note- + :end-before: .. end-note- + +.. start-note-traffic-patterns + +.. note:: + + **Traffic Patterns**: The ISL (Input Sequence Length) and OSL (Output Sequence Length) + values in each configuration represent the **maximum supported values** for that config. + Requests exceeding these limits may result in errors. + + To handle requests with input sequences **longer than the configured ISL**, add the following + to your config file: + + .. code-block:: yaml + + enable_chunked_prefill: true + + This enables chunked prefill, which processes long input sequences in chunks rather than + requiring them to fit within a single prefill operation. Note that enabling chunked prefill + does **not** guarantee optimal performance—these configs are tuned for the specified ISL/OSL. + +.. end-note-traffic-patterns + +.. start-note-quick-start-isl-osl + +.. note:: + + The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Comprehensive Configuration Database` section below which covers a larger set of traffic patterns and performance profiles. + +.. 
end-note-quick-start-isl-osl diff --git a/latest/_sources/developer-guide/perf-analysis.md.txt b/latest/_sources/developer-guide/perf-analysis.md.txt index 3ac01d82ed..4aa26ecbda 100644 --- a/latest/_sources/developer-guide/perf-analysis.md.txt +++ b/latest/_sources/developer-guide/perf-analysis.md.txt @@ -72,10 +72,12 @@ Say we want to profile iterations 100 to 150 on a `trtllm-bench`/`trtllm-serve` #!/bin/bash # Prepare dataset for the benchmark -python3 benchmarks/cpp/prepare_dataset.py \ - --tokenizer=${MODEL_PATH} \ - --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \ - --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt +trtllm-bench --model ${MODEL_PATH} \ + prepare-dataset \ + --output dataset.txt \ + token-norm-dist \ + --num-requests=${NUM_SAMPLES} \ + --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 # Benchmark and profile TLLM_PROFILE_START_STOP=100-150 nsys profile \ diff --git a/latest/_sources/developer-guide/perf-benchmarking.md.txt b/latest/_sources/developer-guide/perf-benchmarking.md.txt index 4e4e3ca421..63bd9f6f8f 100644 --- a/latest/_sources/developer-guide/perf-benchmarking.md.txt +++ b/latest/_sources/developer-guide/perf-benchmarking.md.txt @@ -152,7 +152,7 @@ directory. For example, to generate a synthetic dataset of 1000 requests with a 128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run: ```shell -python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt +trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 ``` ### Running with the PyTorch Workflow @@ -233,13 +233,13 @@ The PyTorch workflow supports benchmarking with LoRA (Low-Rank Adaptation) adapt **Preparing LoRA Dataset** -Use `prepare_dataset.py` with LoRA-specific options to generate requests with LoRA metadata: +Use `trtllm-bench prepare-dataset` with LoRA-specific options to generate requests with LoRA metadata: ```shell -python3 benchmarks/cpp/prepare_dataset.py \ - --stdout \ +trtllm-bench \ + --model /path/to/tokenizer \ + prepare-dataset \ --rand-task-id 0 1 \ - --tokenizer /path/to/tokenizer \ --lora-dir /path/to/loras \ token-norm-dist \ --num-requests 100 \ @@ -310,17 +310,18 @@ Each subdirectory should contain the LoRA adapter files for that specific task. To benchmark multi-modal models with PyTorch workflow, you can follow the similar approach as above. First, prepare the dataset: -```python -python ./benchmarks/cpp/prepare_dataset.py \ - --tokenizer Qwen/Qwen2-VL-2B-Instruct \ - --stdout \ - dataset \ +```bash +trtllm-bench \ + --model Qwen/Qwen2-VL-2B-Instruct \ + prepare-dataset \ + --output mm_data.jsonl + real-dataset --dataset-name lmms-lab/MMMU \ --dataset-split test \ --dataset-image-key image \ --dataset-prompt-key question \ --num-requests 10 \ - --output-len-dist 128,5 > mm_data.jsonl + --output-len-dist 128,5 ``` It will download the media files to `/tmp` directory and prepare the dataset with their paths. Note that the `prompt` fields are texts and not tokenized ids. This is due to the fact that the `prompt` and the media (image/video) are processed by a preprocessor for multimodal files. @@ -423,10 +424,10 @@ checkpoint. 
For the Llama-3.1 models, TensorRT LLM provides the following checkp - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8) - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) -To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html). +To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html). `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above: diff --git a/latest/_sources/developer-guide/perf-overview.md.txt b/latest/_sources/developer-guide/perf-overview.md.txt index 0a144a58d4..aefa91fd43 100644 --- a/latest/_sources/developer-guide/perf-overview.md.txt +++ b/latest/_sources/developer-guide/perf-overview.md.txt @@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To The performance numbers below were collected using the steps described in this document. -Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). +Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). *(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:* diff --git a/latest/_sources/examples/curl_chat_client.rst.txt b/latest/_sources/examples/curl_chat_client.rst.txt index d3709ccd9c..f5a6ef236b 100644 --- a/latest/_sources/examples/curl_chat_client.rst.txt +++ b/latest/_sources/examples/curl_chat_client.rst.txt @@ -2,7 +2,7 @@ Curl Chat Client ================ Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_chat_client.sh. .. literalinclude:: ../../../examples/serve/curl_chat_client.sh :lines: 1-11 diff --git a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt index 73760884c2..17e6340f42 100644 --- a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt +++ b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt @@ -2,7 +2,7 @@ Curl Chat Client For Multimodal =============================== Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client_for_multimodal.sh. 
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_chat_client_for_multimodal.sh. .. literalinclude:: ../../../examples/serve/curl_chat_client_for_multimodal.sh :lines: 1-88 diff --git a/latest/_sources/examples/curl_completion_client.rst.txt b/latest/_sources/examples/curl_completion_client.rst.txt index c2f4e9a14e..b4ef6aa5d3 100644 --- a/latest/_sources/examples/curl_completion_client.rst.txt +++ b/latest/_sources/examples/curl_completion_client.rst.txt @@ -2,7 +2,7 @@ Curl Completion Client ====================== Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_completion_client.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_completion_client.sh. .. literalinclude:: ../../../examples/serve/curl_completion_client.sh :lines: 1-10 diff --git a/latest/_sources/examples/curl_responses_client.rst.txt b/latest/_sources/examples/curl_responses_client.rst.txt new file mode 100644 index 0000000000..bcb3bcd62b --- /dev/null +++ b/latest/_sources/examples/curl_responses_client.rst.txt @@ -0,0 +1,10 @@ +Curl Responses Client +===================== +Refer to the `trtllm-serve documentation `_ for starting a server. + +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/curl_responses_client.sh. + +.. literalinclude:: ../../../examples/serve/curl_responses_client.sh + :lines: 1-9 + :language: bash + :linenos: diff --git a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt index 4e0a039fe1..4121dcc52f 100644 --- a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt +++ b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt @@ -2,7 +2,7 @@ Deepseek R1 Reasoning Parser ============================ Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/deepseek_r1_reasoning_parser.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/deepseek_r1_reasoning_parser.sh. .. literalinclude:: ../../../examples/serve/deepseek_r1_reasoning_parser.sh :lines: 1-23 diff --git a/latest/_sources/examples/genai_perf_client.rst.txt b/latest/_sources/examples/genai_perf_client.rst.txt index 4f222352aa..9bb9012949 100644 --- a/latest/_sources/examples/genai_perf_client.rst.txt +++ b/latest/_sources/examples/genai_perf_client.rst.txt @@ -2,7 +2,7 @@ Genai Perf Client ================= Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/genai_perf_client.sh. .. 
literalinclude:: ../../../examples/serve/genai_perf_client.sh :lines: 1-16 diff --git a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt index 6ae821dace..aa6f66eace 100644 --- a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt +++ b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt @@ -2,7 +2,7 @@ Genai Perf Client For Multimodal ================================ Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client_for_multimodal.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/genai_perf_client_for_multimodal.sh. .. literalinclude:: ../../../examples/serve/genai_perf_client_for_multimodal.sh :lines: 1-19 diff --git a/latest/_sources/examples/llm_guided_decoding.rst.txt b/latest/_sources/examples/llm_guided_decoding.rst.txt index c7a50512da..c1c9622871 100644 --- a/latest/_sources/examples/llm_guided_decoding.rst.txt +++ b/latest/_sources/examples/llm_guided_decoding.rst.txt @@ -1,6 +1,6 @@ Generate text with guided decoding ================================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_guided_decoding.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_guided_decoding.py. .. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py :lines: 4-47 diff --git a/latest/_sources/examples/llm_inference.rst.txt b/latest/_sources/examples/llm_inference.rst.txt index be80e456eb..a0379d8bf0 100644 --- a/latest/_sources/examples/llm_inference.rst.txt +++ b/latest/_sources/examples/llm_inference.rst.txt @@ -1,6 +1,6 @@ Generate text ============= -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference.py. .. literalinclude:: ../../../examples/llm-api/llm_inference.py :lines: 4-35 diff --git a/latest/_sources/examples/llm_inference_async.rst.txt b/latest/_sources/examples/llm_inference_async.rst.txt index f7ff40a646..3da36720c2 100644 --- a/latest/_sources/examples/llm_inference_async.rst.txt +++ b/latest/_sources/examples/llm_inference_async.rst.txt @@ -1,6 +1,6 @@ Generate text asynchronously ============================ -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_async.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_async.py. .. literalinclude:: ../../../examples/llm-api/llm_inference_async.py :lines: 4-43 diff --git a/latest/_sources/examples/llm_inference_async_streaming.rst.txt b/latest/_sources/examples/llm_inference_async_streaming.rst.txt index 0736586f2f..5d4711e145 100644 --- a/latest/_sources/examples/llm_inference_async_streaming.rst.txt +++ b/latest/_sources/examples/llm_inference_async_streaming.rst.txt @@ -1,6 +1,6 @@ Generate text in streaming ========================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_async_streaming.py. 
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_async_streaming.py. .. literalinclude:: ../../../examples/llm-api/llm_inference_async_streaming.py :lines: 4-64 diff --git a/latest/_sources/examples/llm_inference_distributed.rst.txt b/latest/_sources/examples/llm_inference_distributed.rst.txt index a04aa99313..07cc8963df 100644 --- a/latest/_sources/examples/llm_inference_distributed.rst.txt +++ b/latest/_sources/examples/llm_inference_distributed.rst.txt @@ -1,6 +1,6 @@ Distributed LLM Generation ========================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference_distributed.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_inference_distributed.py. .. literalinclude:: ../../../examples/llm-api/llm_inference_distributed.py :lines: 4-44 diff --git a/latest/_sources/examples/llm_kv_cache_connector.rst.txt b/latest/_sources/examples/llm_kv_cache_connector.rst.txt index 0a150c4a36..32b443ae33 100644 --- a/latest/_sources/examples/llm_kv_cache_connector.rst.txt +++ b/latest/_sources/examples/llm_kv_cache_connector.rst.txt @@ -1,6 +1,6 @@ KV Cache Connector ================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_kv_cache_connector.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_kv_cache_connector.py. .. literalinclude:: ../../../examples/llm-api/llm_kv_cache_connector.py :lines: 4-326 diff --git a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt index a64445a962..5ae7bb74b1 100644 --- a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt +++ b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt @@ -1,6 +1,6 @@ KV Cache Offloading =================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_kv_cache_offloading.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_kv_cache_offloading.py. .. literalinclude:: ../../../examples/llm-api/llm_kv_cache_offloading.py :lines: 4-134 diff --git a/latest/_sources/examples/llm_logits_processor.rst.txt b/latest/_sources/examples/llm_logits_processor.rst.txt index b739b44ca9..e2c401f98b 100644 --- a/latest/_sources/examples/llm_logits_processor.rst.txt +++ b/latest/_sources/examples/llm_logits_processor.rst.txt @@ -1,6 +1,6 @@ Control generated text using logits processor ============================================= -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_logits_processor.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_logits_processor.py. .. 
literalinclude:: ../../../examples/llm-api/llm_logits_processor.py :lines: 4-128 diff --git a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt index 0a84a19a28..fbaaae9489 100644 --- a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt +++ b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt @@ -1,6 +1,6 @@ Run LLM-API with pytorch backend on Slurm ========================================= -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_llm_distributed.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_llm_distributed.sh. .. literalinclude:: ../../../examples/llm-api/llm_mgmn_llm_distributed.sh :lines: 1-48,52-94 diff --git a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt index ddfa9f47ca..bb9f5bfdb7 100644 --- a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt +++ b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt @@ -1,8 +1,8 @@ Run trtllm-bench with pytorch backend on Slurm ============================================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_trtllm_bench.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_trtllm_bench.sh. .. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_bench.sh - :lines: 1-46,50-131 + :lines: 1-46,50-130 :language: bash :linenos: diff --git a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt index 18e6c10c8c..d3ebb95460 100644 --- a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt +++ b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt @@ -1,6 +1,6 @@ Run trtllm-serve with pytorch backend on Slurm ============================================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_mgmn_trtllm_serve.sh. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_mgmn_trtllm_serve.sh. .. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_serve.sh :lines: 1-46,50-92 diff --git a/latest/_sources/examples/llm_multilora.rst.txt b/latest/_sources/examples/llm_multilora.rst.txt index b0f9fdf5ec..5a4ef4786d 100644 --- a/latest/_sources/examples/llm_multilora.rst.txt +++ b/latest/_sources/examples/llm_multilora.rst.txt @@ -1,6 +1,6 @@ Generate text with multiple LoRA adapters ========================================= -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_multilora.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_multilora.py. .. literalinclude:: ../../../examples/llm-api/llm_multilora.py :lines: 4-89 diff --git a/latest/_sources/examples/llm_runtime.rst.txt b/latest/_sources/examples/llm_runtime.rst.txt index c7405bcbe5..b5c67ea9d7 100644 --- a/latest/_sources/examples/llm_runtime.rst.txt +++ b/latest/_sources/examples/llm_runtime.rst.txt @@ -1,6 +1,6 @@ Runtime Configuration Examples ============================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_runtime.py. 
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_runtime.py. .. literalinclude:: ../../../examples/llm-api/llm_runtime.py :lines: 4-144 diff --git a/latest/_sources/examples/llm_sampling.rst.txt b/latest/_sources/examples/llm_sampling.rst.txt index bc4c60a7ce..050450c330 100644 --- a/latest/_sources/examples/llm_sampling.rst.txt +++ b/latest/_sources/examples/llm_sampling.rst.txt @@ -1,6 +1,6 @@ Sampling Techniques Showcase ============================ -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_sampling.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_sampling.py. .. literalinclude:: ../../../examples/llm-api/llm_sampling.py :lines: 4-248 diff --git a/latest/_sources/examples/llm_sparse_attention.rst.txt b/latest/_sources/examples/llm_sparse_attention.rst.txt index 1c398bb1f0..c13f175d1e 100644 --- a/latest/_sources/examples/llm_sparse_attention.rst.txt +++ b/latest/_sources/examples/llm_sparse_attention.rst.txt @@ -1,6 +1,6 @@ Sparse Attention ================ -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_sparse_attention.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_sparse_attention.py. .. literalinclude:: ../../../examples/llm-api/llm_sparse_attention.py :lines: 4-229 diff --git a/latest/_sources/examples/llm_speculative_decoding.rst.txt b/latest/_sources/examples/llm_speculative_decoding.rst.txt index 689d6af530..dbfca2fb58 100644 --- a/latest/_sources/examples/llm_speculative_decoding.rst.txt +++ b/latest/_sources/examples/llm_speculative_decoding.rst.txt @@ -1,6 +1,6 @@ Speculative Decoding ==================== -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_speculative_decoding.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/llm-api/llm_speculative_decoding.py. .. literalinclude:: ../../../examples/llm-api/llm_speculative_decoding.py :lines: 4-95 diff --git a/latest/_sources/examples/openai_chat_client.rst.txt b/latest/_sources/examples/openai_chat_client.rst.txt index 29cf974ab0..bc25fbfefb 100644 --- a/latest/_sources/examples/openai_chat_client.rst.txt +++ b/latest/_sources/examples/openai_chat_client.rst.txt @@ -2,7 +2,7 @@ OpenAI Chat Client ================== Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_chat_client.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_chat_client.py. .. literalinclude:: ../../../examples/serve/openai_chat_client.py :lines: 2-21 diff --git a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt index b3fb0a07bc..9eb49504d9 100644 --- a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt +++ b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt @@ -2,7 +2,7 @@ OpenAI Chat Client for Multimodal ================================= Refer to the `trtllm-serve documentation `_ for starting a server. 
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_chat_client_for_multimodal.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_chat_client_for_multimodal.py. .. literalinclude:: ../../../examples/serve/openai_chat_client_for_multimodal.py :lines: 2-129 diff --git a/latest/_sources/examples/openai_completion_client.rst.txt b/latest/_sources/examples/openai_completion_client.rst.txt index 7b60afc04d..54a9fac182 100644 --- a/latest/_sources/examples/openai_completion_client.rst.txt +++ b/latest/_sources/examples/openai_completion_client.rst.txt @@ -2,7 +2,7 @@ OpenAI Completion Client ======================== Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client.py. .. literalinclude:: ../../../examples/serve/openai_completion_client.py :lines: 2-15 diff --git a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt index 4eabf04fea..121ff107e2 100644 --- a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt +++ b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt @@ -2,7 +2,7 @@ Openai Completion Client For Lora ================================= Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client_for_lora.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client_for_lora.py. .. literalinclude:: ../../../examples/serve/openai_completion_client_for_lora.py :lines: 1-30 diff --git a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt index 8ed397f1cd..1eee39507d 100644 --- a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt +++ b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt @@ -2,7 +2,7 @@ OpenAI Completion Client with JSON Schema ========================================= Refer to the `trtllm-serve documentation `_ for starting a server. -Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/openai_completion_client_json_schema.py. +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_completion_client_json_schema.py. .. literalinclude:: ../../../examples/serve/openai_completion_client_json_schema.py :lines: 2-52 diff --git a/latest/_sources/examples/openai_responses_client.rst.txt b/latest/_sources/examples/openai_responses_client.rst.txt new file mode 100644 index 0000000000..f8b4c62bc5 --- /dev/null +++ b/latest/_sources/examples/openai_responses_client.rst.txt @@ -0,0 +1,10 @@ +OpenAI Responses Client +======================= +Refer to the `trtllm-serve documentation `_ for starting a server. + +Source https://github.com/NVIDIA/TensorRT-LLM/blob/9ba14263db0045ed3fa0860f949b5ce320107eb3/examples/serve/openai_responses_client.py. + +.. 
literalinclude:: ../../../examples/serve/openai_responses_client.py + :lines: 2-15 + :language: python + :linenos: diff --git a/latest/_sources/examples/trtllm_serve_examples.rst.txt b/latest/_sources/examples/trtllm_serve_examples.rst.txt index f39dfcee67..e61fd0e9ff 100644 --- a/latest/_sources/examples/trtllm_serve_examples.rst.txt +++ b/latest/_sources/examples/trtllm_serve_examples.rst.txt @@ -10,6 +10,7 @@ Online Serving Examples curl_chat_client curl_chat_client_for_multimodal curl_completion_client + curl_responses_client deepseek_r1_reasoning_parser genai_perf_client genai_perf_client_for_multimodal @@ -18,4 +19,5 @@ Online Serving Examples openai_completion_client openai_completion_client_for_lora openai_completion_client_json_schema + openai_responses_client diff --git a/latest/_sources/features/auto_deploy/support_matrix.md.txt b/latest/_sources/features/auto_deploy/support_matrix.md.txt index 26c07b308b..fec6d841af 100644 --- a/latest/_sources/features/auto_deploy/support_matrix.md.txt +++ b/latest/_sources/features/auto_deploy/support_matrix.md.txt @@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/latest/_sources/features/quantization.md.txt b/latest/_sources/features/quantization.md.txt index 8a0e160529..7998f1c03a 100644 --- a/latest/_sources/features/quantization.md.txt +++ b/latest/_sources/features/quantization.md.txt @@ -11,6 +11,7 @@ TensorRT LLM offers a variety of quantization recipes to optimize LLM inference. * FP8 Block Scaling * FP8 Rowwise * FP8 KV Cache +* NVFP4 KV Cache * W4A16 GPTQ * W4A8 GPTQ * W4A16 AWQ @@ -23,7 +24,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac ### Running Pre-quantized Models -TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm import LLM @@ -47,6 +48,20 @@ llm = LLM(model='/path/to/model', llm.generate("Hello, my name is") ``` +#### NVFP4 KV Cache + +To enable NVFP4 KV cache, offline quantization with ModelOpt is required. Please follow the below section for instructions. +After the quantization is done, the NVFP4 KV cache option can be set by: + +```python +from tensorrt_llm import LLM +from tensorrt_llm.llmapi import KvCacheConfig +llm = LLM(model='/path/to/model', + kv_cache_config=KvCacheConfig(dtype='nvfp4')) +llm.generate("Hello, my name is") +``` + + ### Offline Quantization with ModelOpt If a pre-quantized model is not available on the [Hugging Face Hub](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4), you can quantize it offline using ModelOpt. 
@@ -54,35 +69,47 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg Follow this step-by-step guide to quantize a model: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq -scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq +scripts/huggingface_example.sh --model --quant fp8 ``` +#### NVFP4 KV Cache + +To generate the checkpoint for NVFP4 KV cache: + +```bash +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd TensorRT-Model-Optimizer/examples/llm_ptq +scripts/huggingface_example.sh --model --quant fp8 --kv_cache_quant nvfp4 +``` + +Note that currently TRT-LLM only supports FP8 weight/activation quantization when NVFP4 KV cache is enabled. Therefore, `--quant fp8` is required here. + ## Model Supported Matrix -| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | -| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | -| BERT | . | . | . | . | . | Y | . | . | . | . | -| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . | -| EXAONE | . | . | Y | . | . | Y | Y | Y | . | . | -| Gemma 3 | . | . | Y | . | . | Y | Y | Y | . | . | -| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . | -| LLaMA | Y | . | Y | . | . | Y | . | Y | . | Y | -| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | . | Y | -| LLaMA 3 | . | . | . | . | Y | Y | Y | . | . | . | -| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . | -| Mistral | . | . | Y | . | . | Y | . | Y | . | . | -| Mixtral | Y | . | Y | . | . | Y | . | . | . | . | -| Phi | . | . | . | . | . | Y | Y | . | . | . | -| Qwen | . | . | . | . | . | Y | Y | Y | . | Y | -| Qwen-2/2.5 | Y | . | Y | . | . | Y | Y | Y | . | Y | -| Qwen-3 | Y | . | Y | . | . | Y | . | Y | . | Y | -| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . | -| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . | -| LLaVA | . | . | Y | . | . | Y | . | Y | . | Y | -| VILA | . | . | Y | . | . | Y | . | Y | . | Y | -| Nougat | . | . | . | . | . | Y | . | . | . | . | +| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | +| :------------- | :---: | :---: | :---: | :---: | :---: | :---: |:---:| :-------: | :-------: | :--------: | :--------: | +| BERT | . | . | . | . | . | Y | . | . | . | . | . | +| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . | . | +| EXAONE | . | . | Y | . | . | Y | . | Y | Y | . | . | +| Gemma 3 | . | . | Y | . | . | Y | . | Y | Y | . | . | +| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . | . | +| LLaMA | Y | . | Y | . | . | Y | . | . | Y | . | Y | +| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | Y | . | Y | +| LLaMA 3 | . | . | . | . | Y | Y | Y | Y | . | . | . | +| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . | . | +| Mistral | . | . | Y | . | . | Y | . | . | Y | . | . | +| Mixtral | Y | . | Y | . | . | Y | . | . | . | . | . | +| Phi | . | . | . | . | . | Y | . | Y | . | . | . | +| Qwen | . | . | . | . | . | Y | . | Y | Y | . | Y | +| Qwen-2/2.5 | Y | . | Y | . | . | Y | . | Y | Y | . | Y | +| Qwen-3 | Y | . | Y | . | . | Y | Y | . | Y | . | Y | +| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . | . | +| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . | . | +| LLaVA | . | . | Y | . | . | Y | . | . | Y | . 
| Y | +| VILA | . | . | Y | . | . | Y | . | . | Y | . | Y | +| Nougat | . | . | . | . | . | Y | . | . | . | . | . | ```{note} @@ -93,13 +120,13 @@ The language component decides which quantization methods are supported by a giv ## Hardware Support Matrix -| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | -| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | -| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . | -| Blackwell(sm100) | Y | Y | Y | Y | . | Y | . | . | . | . | -| Hopper | . | . | Y | Y | Y | Y | Y | Y | Y | Y | -| Ada Lovelace | . | . | Y | . | . | Y | Y | Y | Y | Y | -| Ampere | . | . | . | . | . | Y | . | Y | . | Y | +| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache | NVFP4 KV Cache | W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ | +| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: | +| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . | . | +| Blackwell(sm100) | Y | Y | Y | Y | . | Y | Y | . | . | . | . | +| Hopper | . | . | Y | Y | Y | Y | . | Y | Y | Y | Y | +| Ada Lovelace | . | . | Y | . | . | Y | . | Y | Y | Y | Y | +| Ampere | . | . | . | . | . | Y | . | . | Y | . | Y | ```{note} FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/weight and UE8M0 act/weight scale), which is slightly different from SM90 FP8 recipe (E4M3 act/weight and FP32 act/weight scale). ``` @@ -108,4 +135,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w ## Quick Links - [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) -- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) +- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) diff --git a/latest/_sources/legacy/performance/perf-analysis.md.txt b/latest/_sources/legacy/performance/perf-analysis.md.txt index f72437f4e9..51abd6460d 100644 --- a/latest/_sources/legacy/performance/perf-analysis.md.txt +++ b/latest/_sources/legacy/performance/perf-analysis.md.txt @@ -66,10 +66,10 @@ Say we want to profile iterations 100 to 150 on a trtllm-bench/trtllm-serve run, #!/bin/bash # Prepare dataset for the benchmark -python3 benchmarks/cpp/prepare_dataset.py \ - --tokenizer=${MODEL_PATH} \ - --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \ - --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt +trtllm-bench \ + --model=${MODEL_PATH} prepare-dataset \ + --output /tmp/dataset.txt token-norm-dist --num-requests=${NUM_SAMPLES} \ + --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 # Benchmark and profile TLLM_PROFILE_START_STOP=100-150 nsys profile \ diff --git a/latest/_sources/legacy/performance/perf-benchmarking.md.txt b/latest/_sources/legacy/performance/perf-benchmarking.md.txt index 55caef07ba..9530b6da1b 100644 --- a/latest/_sources/legacy/performance/perf-benchmarking.md.txt +++ b/latest/_sources/legacy/performance/perf-benchmarking.md.txt @@ -110,7 +110,7 @@ of 128:128. 
To run the benchmark from start to finish, run the following commands: ```shell -python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt +trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 trtllm-bench --model meta-llama/Llama-3.1-8B build --dataset /tmp/synthetic_128_128.txt --quantization FP8 trtllm-bench --model meta-llama/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-3.1-8B/tp_1_pp_1 ``` @@ -207,7 +207,7 @@ directory. For example, to generate a synthetic dataset of 1000 requests with a 128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run: ```shell -benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt +trtllm-bench --tokenizer meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 ``` ### Building a Benchmark Engine @@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above: diff --git a/latest/_sources/legacy/reference/support-matrix.md.txt b/latest/_sources/legacy/reference/support-matrix.md.txt index 1dc59fcfa0..24a3a01512 100644 --- a/latest/_sources/legacy/reference/support-matrix.md.txt +++ b/latest/_sources/legacy/reference/support-matrix.md.txt @@ -133,6 +133,7 @@ In addition, older architectures can have limitations for newer software release * - GPU Model Architectures - - [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/) + - [NVIDIA GB300 NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72/) - [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/) - [NVIDIA Grace Hopper Superchip](https://www.nvidia.com/en-us/data-center/grace-hopper-superchip/) - [NVIDIA Hopper Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/) diff --git a/latest/_sources/llm-api/reference.rst.txt b/latest/_sources/llm-api/reference.rst.txt index 76a2c9f0e2..8816f4ccc3 100644 --- a/latest/_sources/llm-api/reference.rst.txt +++ b/latest/_sources/llm-api/reference.rst.txt @@ -17,6 +17,14 @@ API Reference :member-order: groupwise :inherited-members: +.. autoclass:: tensorrt_llm.llmapi.AsyncLLM + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :member-order: groupwise + :inherited-members: + .. 
autoclass:: tensorrt_llm.llmapi.MultimodalEncoder :members: :undoc-members: @@ -288,7 +296,7 @@ API Reference :special-members: __init__ :member-order: groupwise :inherited-members: - :exclude-members: model_parametrized_name,update_forward_refs,model_rebuild,parse_raw,from_orm,model_validate_strings,model_computed_fields,validate,model_post_init,model_copy,dict,schema,parse_obj,json,model_validate_json,copy,model_config,model_dump_json,model_fields,schema_json,construct,model_extra,model_json_schema,model_validate,model_dump,parse_file,model_fields_set,model_construct + :exclude-members: model_rebuild,model_fields_set,parse_obj,model_post_init,model_fields,validate,from_orm,update_forward_refs,model_dump_json,model_dump,parse_file,model_json_schema,model_parametrized_name,json,model_validate,model_config,model_copy,model_construct,parse_raw,model_validate_json,dict,construct,schema,copy,model_validate_strings,model_computed_fields,model_extra,schema_json .. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs :members: @@ -297,7 +305,7 @@ API Reference :special-members: __init__ :member-order: groupwise :inherited-members: - :exclude-members: model_parametrized_name,update_forward_refs,model_rebuild,parse_raw,from_orm,model_validate_strings,model_computed_fields,validate,model_post_init,model_copy,dict,schema,parse_obj,json,model_validate_json,copy,model_config,model_dump_json,model_fields,schema_json,construct,model_extra,model_json_schema,model_validate,model_dump,parse_file,model_fields_set,model_construct + :exclude-members: model_rebuild,model_fields_set,parse_obj,model_post_init,model_fields,validate,from_orm,update_forward_refs,model_dump_json,model_dump,parse_file,model_json_schema,model_parametrized_name,json,model_validate,model_config,model_copy,model_construct,parse_raw,model_validate_json,dict,construct,schema,copy,model_validate_strings,model_computed_fields,model_extra,schema_json .. autoclass:: tensorrt_llm.llmapi.AutoDecodingConfig :members: diff --git a/latest/_sources/models/supported-models.md.txt b/latest/_sources/models/supported-models.md.txt index c6b6194b5d..40f3840073 100644 --- a/latest/_sources/models/supported-models.md.txt +++ b/latest/_sources/models/supported-models.md.txt @@ -8,6 +8,7 @@ The following is a table of supported models for the PyTorch backend: | `BertForSequenceClassification` | BERT-based | `textattack/bert-base-uncased-yelp-polarity` | | `DeciLMForCausalLM` | Nemotron | `nvidia/Llama-3_1-Nemotron-51B-Instruct` | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3` | +| `DeepseekV32ForCausalLM` | DeepSeek-V3.2 | `deepseek-ai/DeepSeek-V3.2` | | `Exaone4ForCausalLM` | EXAONE 4.0 | `LGAI-EXAONE/EXAONE-4.0-32B` | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it` | | `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b` | @@ -34,6 +35,7 @@ Note: Support for other models may vary. 
Features marked "N/A" are not applicabl | Model Architecture/Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Sliding Window Attention | Logits Post Processor | Guided Decoding | | ------------------------------ | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | --- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ------------------------ | --------------------- | --------------- | | `DeepseekV3ForCausalLM` | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes | +| `DeepseekV32ForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | N/A | Yes | Yes | | `Qwen3MoeForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes | | `Qwen3NextForCausalLM` | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested | | `Llama4ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes | diff --git a/latest/_sources/overview.md.txt b/latest/_sources/overview.md.txt index 0df4f72539..471e57ff23 100644 --- a/latest/_sources/overview.md.txt +++ b/latest/_sources/overview.md.txt @@ -4,7 +4,7 @@ ## About TensorRT LLM -[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs. +[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs. ## Key Capabilities @@ -40,7 +40,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**. ### 🚀 **Advanced Optimization & Production Features** - **[In-Flight Batching & Paged Attention](./features/paged-attention-ifb-scheduler.md)**: In-flight batching eliminates wait times by dynamically managing request execution, processing context and generation phases together for maximum GPU utilization and reduced latency. - **[Multi-GPU Multi-Node Inference](./features/parallel-strategy.md)**: Seamless distributed inference with tensor, pipeline, and expert parallelism across multiple GPUs and nodes through the Model Definition API. -- **[Advanced Quantization](./features/quantization.md)**: +- **[Advanced Quantization](./features/quantization.md)**: - **FP4 Quantization**: Native support on NVIDIA B200 GPUs with optimized FP4 kernels - **FP8 Quantization**: Automatic conversion on NVIDIA H100 GPUs leveraging Hopper architecture - **[Speculative Decoding](./features/speculative-decoding.md)**: Multiple algorithms including EAGLE, MTP and NGram @@ -54,7 +54,7 @@ TensorRT LLM strives to support the most popular models on **Day 0**. 
### 🔧 **Latest GPU Architecture Support** TensorRT LLM supports the full spectrum of NVIDIA GPU architectures: -- **NVIDIA Blackwell**: B200, GB200, RTX Pro 6000 SE with FP4 optimization +- **NVIDIA Blackwell**: B200, GB200, B300, GB300, and RTX Pro 6000 SE with FP4 optimization - **NVIDIA Hopper**: H100, H200,GH200 with FP8 acceleration - **NVIDIA Ada Lovelace**: L40/L40S, RTX 40 series with FP8 acceleration - **NVIDIA Ampere**: A100, RTX 30 series for production workloads diff --git a/latest/_sources/quick-start-guide.md.txt b/latest/_sources/quick-start-guide.md.txt index 088f70b3ea..6eff451feb 100644 --- a/latest/_sources/quick-start-guide.md.txt +++ b/latest/_sources/quick-start-guide.md.txt @@ -10,7 +10,7 @@ This is the starting point to try out TensorRT LLM. Specifically, this Quick Sta The [TensorRT LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) maintained by NVIDIA contains all of the required dependencies pre-installed. You can start the container on a machine with NVIDIA GPUs via: ```bash -docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 +docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 ``` diff --git a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt index 4df92f0cf7..cf4c2c94dd 100644 --- a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt +++ b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt @@ -190,6 +190,25 @@ Specifies which sharding dimensions to apply during heuristic sharding. The avai You can enable multiple dimensions simultaneously. For example, `['tp', 'ep']` will apply both tensor parallelism and expert parallelism. +#### `process_grid` (dict, default: `None`) + +Specifies a 2D device mesh for hybrid EP+TP parallelism. + +- NOTE 1: This grid applies only to the MoE layers. Attention, Mamba, and MLP layers are unaffected. +- NOTE 2: The order of the keys matters. Process grid's layout is in the generalized column-major order, + that is, the last dimension is stride-one. +- NOTE 3: `ep * tp` must be equal to the provided world size. Otherwise, the mesh will be considered invalid, + and 1D ep-only parallelism will be applied. + +Example: + +``` + process_grid: {'ep': 2, 'tp': 2} +``` + +If `world_size == 4`, ranks \[0,1\] and \[2,3\] will create two EP groups. Experts will be distributed across these two +groups, and internally, TP=2 column-row sharding will be applied. + #### `requires_shape_prop` (bool, default: `true`) Whether shape propagation is required before applying this transform. Shape propagation enables the transform to make informed decisions about sharding strategies based on tensor dimensions. 
diff --git a/latest/_sources/torch/auto_deploy/support_matrix.md.txt b/latest/_sources/torch/auto_deploy/support_matrix.md.txt index c8780cbca1..f0158253dd 100644 --- a/latest/_sources/torch/auto_deploy/support_matrix.md.txt +++ b/latest/_sources/torch/auto_deploy/support_matrix.md.txt @@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/latest/_sources/torch/features/quantization.md.txt b/latest/_sources/torch/features/quantization.md.txt index a2b6c48be2..47cc745165 100644 --- a/latest/_sources/torch/features/quantization.md.txt +++ b/latest/_sources/torch/features/quantization.md.txt @@ -1,7 +1,7 @@ # Quantization The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub, -which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm._torch import LLM @@ -12,7 +12,7 @@ llm.generate("Hello, my name is") Or you can try the following commands to get a quantized model by yourself: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` diff --git a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html index 25eacbb65f..0519f15432 100644 --- a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html +++ b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@

  • @@ -612,8 +614,11 @@ For NVIDIA Hopper GPUs, it’s recommended to use the FP8 version of the DeepSee YOUR_MODEL_PATH=<YOUR_MODEL_PATH> cd $YOUR_MODEL_PATH -## Download FP4 model for Blackwell GPUs -git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4 +## Download NVFP4 model for Blackwell GPUs +git clone https://huggingface.co/nvidia/DeepSeek-R1-NVFP4-v2 + +## Or the 0528 version +git clone https://huggingface.co/nvidia/DeepSeek-R1-0528-NVFP4-v2 ## Download FP8 model for Hopper GPUs ## FP8 model also works for Blackwell, but FP4 has the best performance on Blackwell. @@ -784,13 +789,13 @@ trtllm-bench --model nvidia/DeepS

    Benchmark#

    To do the benchmark, run the following command:

    # generate synthetic dataset
    -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
    -        --stdout \
    -        --tokenizer nvidia/DeepSeek-R1-FP4 \
    +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
    +        prepare-dataset \
    +        --output dataset.txt \
             token-norm-dist \
             --input-mean 1024 --output-mean 2048 \
             --input-stdev 0 --output-stdev 0 \
    -        --num-requests 49152 > dataset.txt
    +        --num-requests 49152
     
     YOUR_DATA_PATH=./dataset.txt
     
    @@ -888,13 +893,14 @@ trtllm-bench --model deepseek-ai/D
     

Our benchmark results are based on Batch = 1024, ISL = 1K, OSL = 2K, num_requests = 5120 from a real dataset. To do the benchmark, run the following command:

    # generate synthetic dataset
    -python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
    -        --stdout \
    -        --tokenizer deepseek-ai/DeepSeek-R1 \
    +trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
    +        prepare-dataset \
    +        --output dataset.txt \
             token-norm-dist \
             --input-mean 1024 --output-mean 2048 \
             --input-stdev 0 --output-stdev 0 \
    -        --num-requests 5120 > dataset.txt
    +        --num-requests 5120
    +
     YOUR_DATA_PATH=./dataset.txt
     
     cat >./extra-llm-api-config.yml<<EOF
    @@ -941,10 +947,10 @@ trtllm-bench -m deepseek-ai/DeepSe
     
     

    Exploring more ISL/OSL combinations#

    -

    To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use prepare_dataset.py to generate the dataset and use similar commands mentioned in the previous section. TensorRT LLM is working on enhancements that can make the benchmark process smoother.

    +

To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use the trtllm-bench prepare-dataset subcommand to generate the dataset and reuse the commands shown in the previous section, as sketched below. TensorRT LLM is working on enhancements to make the benchmark process smoother.
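For example, a minimal sketch of preparing an ISL/OSL = 4K/1K dataset with the same subcommand (the ISL/OSL and request-count values here are illustrative placeholders, not results from the original benchmark):

# generate a synthetic dataset for ISL/OSL = 4096/1024 (illustrative values)
trtllm-bench --model deepseek-ai/DeepSeek-R1 \
        prepare-dataset \
        --output dataset_4k_1k.txt \
        token-norm-dist \
        --input-mean 4096 --output-mean 1024 \
        --input-stdev 0 --output-stdev 0 \
        --num-requests 8192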

    WIP: Enable more features by default#

    -

    Currently, there are some features that need to be enabled through a user-defined file extra-llm-api-config.yml, such as CUDA graph, overlap scheduler and attention dp. We’re working on to enable those features by default, so that users can get good out-of-the-box performance on DeepSeek models.

    +

Currently, some features need to be enabled through a user-defined file extra-llm-api-config.yml, such as attention dp. We're working on enabling those features by default so that users can get good out-of-the-box performance on DeepSeek models.

Note that max_batch_size and max_num_tokens can easily affect performance. Their default values are carefully chosen and should deliver good performance in most cases; however, you may still need to tune them for peak performance.

Generally, make sure that max_batch_size is not so low that it bottlenecks throughput, and that max_num_tokens is large enough to cover the maximum input sequence length of the samples in the dataset, as mentioned in the section “WIP: Chunked context support on DeepSeek models” below.

    For more details on max_batch_size and max_num_tokens, refer to Tuning Max Batch Size and Max Num Tokens.
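As an illustration only (not part of the original blog), such an extra-llm-api-config.yml could look like the following; the field names are assumed to map to the corresponding LLM API arguments, and the values are placeholders that should be tuned per workload:

# assumed LLM API option names; tune the values for your workload
cat > ./extra-llm-api-config.yml <<EOF
enable_attention_dp: true
max_batch_size: 1024
max_num_tokens: 3200
EOF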

    @@ -1142,9 +1148,9 @@ trtllm-bench -m deepseek-ai/DeepSe diff --git a/latest/blogs/Falcon180B-H200.html b/latest/blogs/Falcon180B-H200.html index b6d8714df6..5b2bd0fb2e 100644 --- a/latest/blogs/Falcon180B-H200.html +++ b/latest/blogs/Falcon180B-H200.html @@ -61,7 +61,7 @@ @@ -74,7 +74,7 @@ - + @@ -358,6 +358,7 @@
  • @@ -792,9 +794,9 @@ ISL = Input Sequence Length diff --git a/latest/blogs/H100vsA100.html b/latest/blogs/H100vsA100.html index df0126c3d4..5c36b72892 100644 --- a/latest/blogs/H100vsA100.html +++ b/latest/blogs/H100vsA100.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -764,9 +766,9 @@ diff --git a/latest/blogs/H200launch.html b/latest/blogs/H200launch.html index 141a3c5120..c3f3ac0b4b 100644 --- a/latest/blogs/H200launch.html +++ b/latest/blogs/H200launch.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -756,9 +758,9 @@ TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8.

    diff --git a/latest/blogs/XQA-kernel.html b/latest/blogs/XQA-kernel.html index 499db3c0cc..4c14c8754e 100644 --- a/latest/blogs/XQA-kernel.html +++ b/latest/blogs/XQA-kernel.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -723,9 +725,9 @@ ISL = Input Sequence Length diff --git a/latest/blogs/quantization-in-TRT-LLM.html b/latest/blogs/quantization-in-TRT-LLM.html index d847c48431..5e2b5349a5 100644 --- a/latest/blogs/quantization-in-TRT-LLM.html +++ b/latest/blogs/quantization-in-TRT-LLM.html @@ -61,7 +61,7 @@ @@ -74,7 +74,7 @@ - + @@ -358,6 +358,7 @@
  • @@ -858,9 +860,9 @@ diff --git a/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html b/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html index 1490d95381..45a685a126 100644 --- a/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html +++ b/latest/blogs/tech_blog/blog10_ADP_Balance_Strategy.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • @@ -1105,9 +1107,9 @@ The Pareto frontier analysis provides critical insights for real-world deploymen diff --git a/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html b/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html index 2255aff912..7ad6cd1172 100644 --- a/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html +++ b/latest/blogs/tech_blog/blog11_GPT_OSS_Eagle3.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -798,9 +800,9 @@ cat > /config/models/eagle/eagl diff --git a/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html b/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html index 765c4f42f7..5f7b666b8d 100644 --- a/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html +++ b/latest/blogs/tech_blog/blog12_Combining_Guided_Decoding_and_Speculative_Decoding.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • @@ -1019,9 +1021,9 @@ diff --git a/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html b/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html index 6b8f5abc6f..7be3ab8af2 100644 --- a/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html +++ b/latest/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -1086,9 +1088,9 @@ is a certainty-based, training-free approach to accelerate Chain-of-Thought (CoT diff --git a/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html b/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html index 1e37727c86..f353f696d2 100644 --- a/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html +++ b/latest/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -578,7 +580,7 @@

    wo GEMM FP4 quantization#

The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1’s MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. Here, “wo” refers to the output weight matrix.

    -

    We’ve evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The NVIDIA TensorRT Model Optimizer team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:

    +

    We’ve evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The NVIDIA Model Optimizer team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:

    • https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2

    • https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2
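As a hedged illustration (not from the original post), such a pre-quantized checkpoint can be pulled and served in the same way as the other FP4 checkpoints referenced in this documentation; the exact serving flags depend on your deployment and GPU count:

# illustrative only: pull the checkpoint and serve it (flags depend on your setup)
git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2
trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --tp_size 8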

    • @@ -905,9 +907,9 @@ However, since Q is in BF16 format, FMHA will also be performed in BF16 format, diff --git a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html index 82fe6ddea3..13e78dd23d 100644 --- a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html +++ b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -633,7 +635,7 @@

*TensorRT LLM already supports FP8 attention, but low-precision attention computation does not help performance in this latency scenario, so we use BF16 precision for the attention modules.

    -

    ** nvfp4 model checkpoint is generated by the NVIDIA TensorRT Model Optimizer toolkit.

    +

    ** nvfp4 model checkpoint is generated by the NVIDIA Model Optimizer toolkit.

    *** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability

    @@ -1199,9 +1201,9 @@ diff --git a/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html index 4af90cf90b..7a89ed19f3 100644 --- a/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html +++ b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • @@ -944,9 +946,9 @@ trtllm-bench --model nvidia/DeepSe diff --git a/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html index ce6fab6341..ccf5ca3729 100644 --- a/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html +++ b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -559,7 +561,7 @@
  • FP8 KV cache and FP8 attention, rather than BF16 precision.

  • FP4 Allgather for better communication bandwidth utilization.

  • -

    The checkpoint used in this blog is hosted in nvidia/DeepSeek-R1-FP4, generated by NVIDIA Model Optimizer. The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are:

    +

The checkpoint used in this blog is hosted at nvidia/DeepSeek-R1-FP4 and was generated by NVIDIA Model Optimizer. The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:

    @@ -921,9 +923,9 @@ Running the shared and routed experts in 2 streams combined with other multi-str diff --git a/latest/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.html b/latest/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.html index bee5898530..813562cbbc 100644 --- a/latest/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.html +++ b/latest/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • @@ -1465,9 +1467,9 @@ Based on our current performance analysis, when you plan to apply large-scale EP diff --git a/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html b/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html index 46c999ff17..36b6bb09b6 100644 --- a/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html +++ b/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -370,6 +371,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -979,9 +981,9 @@ trtllm-serve disaggregated -c diff --git a/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html b/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html index 01faf0f8e6..615f170282 100644 --- a/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html +++ b/latest/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -822,9 +824,9 @@ diff --git a/latest/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html b/latest/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html index 12071d1199..bcca8e91c5 100644 --- a/latest/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html +++ b/latest/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
diff --git a/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html b/latest/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html
  • @@ -989,9 +991,9 @@ always defer defer+madvise diff --git a/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html b/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html index 1c87110201..63960202da 100644 --- a/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html +++ b/latest/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
diff --git a/latest/commands/trtllm-bench.html b/latest/commands/trtllm-bench.html
  • @@ -1354,9 +1356,9 @@ diff --git a/latest/commands/trtllm-build.html b/latest/commands/trtllm-build.html index 5b6e0b276e..aa4e855a53 100644 --- a/latest/commands/trtllm-build.html +++ b/latest/commands/trtllm-build.html @@ -61,7 +61,7 @@ @@ -74,7 +74,7 @@ - + @@ -358,6 +358,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -366,6 +367,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1021,9 +1023,9 @@ diff --git a/latest/commands/trtllm-eval.html b/latest/commands/trtllm-eval.html index 52385af072..90e128480f 100644 --- a/latest/commands/trtllm-eval.html +++ b/latest/commands/trtllm-eval.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -1463,9 +1465,9 @@ trtllm-eval --model meta-llama/Lla diff --git a/latest/commands/trtllm-serve/index.html b/latest/commands/trtllm-serve/index.html index 76484f9e65..d92b689469 100644 --- a/latest/commands/trtllm-serve/index.html +++ b/latest/commands/trtllm-serve/index.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -358,6 +358,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -366,6 +367,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -652,9 +654,9 @@ diff --git a/latest/commands/trtllm-serve/run-benchmark-with-trtllm-serve.html b/latest/commands/trtllm-serve/run-benchmark-with-trtllm-serve.html index c7fb4c5f66..844d7060c9 100644 --- a/latest/commands/trtllm-serve/run-benchmark-with-trtllm-serve.html +++ b/latest/commands/trtllm-serve/run-benchmark-with-trtllm-serve.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -370,6 +371,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
@@ -548,7 +550,7 @@ A complete reference for the API is available in the #
 TensorRT LLM distributes the pre-built container on NGC Catalog.
 You can launch the container using the following command:
-docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5
+docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6
diff --git a/latest/commands/trtllm-serve/trtllm-serve.html b/latest/commands/trtllm-serve/trtllm-serve.html
@@ -551,7 +553,7 @@
 Inference Endpoints#
-After you start the server, you can send inference requests through completions API and Chat API, which are compatible with corresponding OpenAI APIs. We use TinyLlama-1.1B-Chat-v1.0 for examples in the following sections.
+After you start the server, you can send inference requests through the Completions API, Chat API, and Responses API, which are compatible with the corresponding OpenAI APIs. We use TinyLlama-1.1B-Chat-v1.0 for the examples in the following sections.
 Chat API#
 You can query the Chat API with any HTTP client; a typical example is the OpenAI Python client:

@@ -627,6 +629,40 @@

Responses API#

You can query the Responses API with any HTTP client; a typical example is the OpenAI Python client:

### :title OpenAI Responses Client

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

response = client.responses.create(
    model="TinyLlama-1.1B-Chat-v1.0",
    input="Where is New York?",
    max_output_tokens=20,
)
print(response)

Another example uses curl:

#! /usr/bin/env bash

curl http://localhost:8000/v1/responses \
    -H "Content-Type: application/json" \
    -d '{
        "model": "TinyLlama-1.1B-Chat-v1.0",
        "input": "Where is New York?",
        "max_output_tokens": 16
    }'

More OpenAI-compatible examples can be found in the compatibility examples directory.

    Multimodal Serving#

    For multimodal models, you need to create a configuration file and start the server with additional options due to the following limitations:

@@ -1197,7 +1233,7 @@ Since the statistics are stored in an internal queue and removed once retrieved,
 [Experimental] Specify the parser for reasoning models.
 Options:
-deepseek-r1 | qwen3
+deepseek-r1 | qwen3 | nano-v3
@@ -1208,7 +1244,7 @@ Since the statistics are stored in an internal queue and removed once retrieved,
 [Experimental] Specify the parser for tool models.
 Options:
-qwen3 | qwen3_coder
+qwen3 | qwen3_coder | kimi_k2
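
As a rough illustration of how a parser is selected at launch, a minimal sketch follows; the flag name --reasoning_parser is an assumption inferred from the option description above, so confirm the exact spelling with trtllm-serve --help:

# Sketch only: the flag name below is assumed, not confirmed against this release;
# check `trtllm-serve --help` for the exact option names.
trtllm-serve deepseek-ai/DeepSeek-R1-0528 \
    --reasoning_parser deepseek-r1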

@@ -1329,6 +1365,7 @@ Please refer to Performance Benchmarking with `trtllm-serve <Inference Endpoints

Precision

Every Command in this table follows the pattern trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/<GPU dir>/<Config>, where <GPU dir> is B200 for 8xB200_NVL rows and H200 for 8xH200_SXM rows.

| GPU | Performance Profile | ISL / OSL | Concurrency | Config |
|---|---|---|---|---|
| 8xB200_NVL | Min Latency | 1024 / 1024 | 4 | 1k1k_tp8_conc4.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp8_conc8.yaml |
| 8xB200_NVL | Balanced | 1024 / 1024 | 16 | 1k1k_tp8_conc16.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp8_conc32.yaml |
| 8xB200_NVL | Max Throughput | 1024 / 1024 | 64 | 1k1k_tp8_conc64.yaml |
| 8xB200_NVL | Min Latency | 8192 / 1024 | 4 | 8k1k_tp8_conc4.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp8_conc8.yaml |
| 8xB200_NVL | Balanced | 8192 / 1024 | 16 | 8k1k_tp8_conc16.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp8_conc32.yaml |
| 8xB200_NVL | Max Throughput | 8192 / 1024 | 64 | 8k1k_tp8_conc64.yaml |
| 8xH200_SXM | Min Latency | 1024 / 1024 | 4 | 1k1k_tp8_conc4.yaml |
| 8xH200_SXM | Low Latency | 1024 / 1024 | 8 | 1k1k_tp8_conc8.yaml |
| 8xH200_SXM | Balanced | 1024 / 1024 | 16 | 1k1k_tp8_conc16.yaml |
| 8xH200_SXM | High Throughput | 1024 / 1024 | 32 | 1k1k_tp8_conc32.yaml |
| 8xH200_SXM | Max Throughput | 1024 / 1024 | 64 | 1k1k_tp8_conc64.yaml |
| 8xH200_SXM | Min Latency | 8192 / 1024 | 4 | 8k1k_tp8_conc4.yaml |
| 8xH200_SXM | Low Latency | 8192 / 1024 | 8 | 8k1k_tp8_conc8.yaml |
| 8xH200_SXM | Balanced | 8192 / 1024 | 16 | 8k1k_tp8_conc16.yaml |
| 8xH200_SXM | High Throughput | 8192 / 1024 | 32 | 8k1k_tp8_conc32.yaml |
| 8xH200_SXM | Max Throughput | 8192 / 1024 | 64 | 8k1k_tp8_conc64.yaml |
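
For example, the Max Throughput row above on 8xB200_NVL can be launched directly with the command from the table; TRTLLM_DIR should point at your TensorRT LLM checkout, for instance /app/tensorrt_llm inside the release container:

# DeepSeek-R1-0528, ISL/OSL 1024/1024, concurrency 64, TP8 on B200
TRTLLM_DIR=/app/tensorrt_llm   # adjust to your environment
trtllm-serve deepseek-ai/DeepSeek-R1-0528 \
    --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml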


    DeepSeek-R1 (NVFP4)#

Every Command in this table follows the pattern trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/<Config>.

| GPU | Performance Profile | ISL / OSL | Concurrency | Config |
|---|---|---|---|---|
| 4xB200_NVL | Min Latency | 1024 / 1024 | 4 | 1k1k_tp4_conc4.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 4 | 1k1k_tp8_conc4.yaml |
| 4xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp4_conc8.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp8_conc8.yaml |
| 4xB200_NVL | Low Latency | 1024 / 1024 | 16 | 1k1k_tp4_conc16.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 16 | 1k1k_tp8_conc16.yaml |
| 4xB200_NVL | Low Latency | 1024 / 1024 | 32 | 1k1k_tp4_conc32.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp8_conc32.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 64 | 1k1k_tp4_conc64.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 64 | 1k1k_tp8_conc64.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 128 | 1k1k_tp4_conc128.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 128 | 1k1k_tp8_conc128.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 256 | 1k1k_tp4_conc256.yaml |
| 8xB200_NVL | Max Throughput | 1024 / 1024 | 256 | 1k1k_tp8_conc256.yaml |
| 4xB200_NVL | Min Latency | 8192 / 1024 | 4 | 8k1k_tp4_conc4.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 4 | 8k1k_tp8_conc4.yaml |
| 4xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp4_conc8.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp8_conc8.yaml |
| 4xB200_NVL | Low Latency | 8192 / 1024 | 16 | 8k1k_tp4_conc16.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 16 | 8k1k_tp8_conc16.yaml |
| 4xB200_NVL | Low Latency | 8192 / 1024 | 32 | 8k1k_tp4_conc32.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp8_conc32.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 64 | 8k1k_tp4_conc64.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 64 | 8k1k_tp8_conc64.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 128 | 8k1k_tp4_conc128.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 128 | 8k1k_tp8_conc128.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 256 | 8k1k_tp4_conc256.yaml |
| 8xB200_NVL | Max Throughput | 8192 / 1024 | 256 | 8k1k_tp8_conc256.yaml |


    gpt-oss-120b#

Every Command in this table follows the pattern trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/<GPU dir>/<Config>, where <GPU dir> is B200 for B200_NVL rows and H200 for H200_SXM rows.

| GPU | Performance Profile | ISL / OSL | Concurrency | Config |
|---|---|---|---|---|
| B200_NVL | Min Latency | 1024 / 1024 | 4 | 1k1k_tp1_conc4.yaml |
| 2xB200_NVL | Low Latency | 1024 / 1024 | 4 | 1k1k_tp2_conc4.yaml |
| 4xB200_NVL | Low Latency | 1024 / 1024 | 4 | 1k1k_tp4_conc4.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 4 | 1k1k_tp8_conc4.yaml |
| B200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp1_conc8.yaml |
| 2xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp2_conc8.yaml |
| 4xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp4_conc8.yaml |
| 8xB200_NVL | Low Latency | 1024 / 1024 | 8 | 1k1k_tp8_conc8.yaml |
| B200_NVL | Low Latency | 1024 / 1024 | 16 | 1k1k_tp1_conc16.yaml |
| 2xB200_NVL | Low Latency | 1024 / 1024 | 16 | 1k1k_tp2_conc16.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 16 | 1k1k_tp4_conc16.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 16 | 1k1k_tp8_conc16.yaml |
| B200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp1_conc32.yaml |
| 2xB200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp2_conc32.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp4_conc32.yaml |
| 8xB200_NVL | High Throughput | 1024 / 1024 | 32 | 1k1k_tp8_conc32.yaml |
| B200_NVL | High Throughput | 1024 / 1024 | 64 | 1k1k_tp1_conc64.yaml |
| 2xB200_NVL | High Throughput | 1024 / 1024 | 64 | 1k1k_tp2_conc64.yaml |
| 4xB200_NVL | High Throughput | 1024 / 1024 | 64 | 1k1k_tp4_conc64.yaml |
| 8xB200_NVL | Max Throughput | 1024 / 1024 | 64 | 1k1k_tp8_conc64.yaml |
| B200_NVL | Min Latency | 1024 / 8192 | 4 | 1k8k_tp1_conc4.yaml |
| 2xB200_NVL | Low Latency | 1024 / 8192 | 4 | 1k8k_tp2_conc4.yaml |
| 4xB200_NVL | Low Latency | 1024 / 8192 | 4 | 1k8k_tp4_conc4.yaml |
| 8xB200_NVL | Low Latency | 1024 / 8192 | 4 | 1k8k_tp8_conc4.yaml |
| B200_NVL | Low Latency | 1024 / 8192 | 8 | 1k8k_tp1_conc8.yaml |
| 2xB200_NVL | Low Latency | 1024 / 8192 | 8 | 1k8k_tp2_conc8.yaml |
| 4xB200_NVL | Low Latency | 1024 / 8192 | 8 | 1k8k_tp4_conc8.yaml |
| 8xB200_NVL | Low Latency | 1024 / 8192 | 8 | 1k8k_tp8_conc8.yaml |
| B200_NVL | Low Latency | 1024 / 8192 | 16 | 1k8k_tp1_conc16.yaml |
| 2xB200_NVL | Low Latency | 1024 / 8192 | 16 | 1k8k_tp2_conc16.yaml |
| 4xB200_NVL | High Throughput | 1024 / 8192 | 16 | 1k8k_tp4_conc16.yaml |
| 8xB200_NVL | High Throughput | 1024 / 8192 | 16 | 1k8k_tp8_conc16.yaml |
| B200_NVL | High Throughput | 1024 / 8192 | 32 | 1k8k_tp1_conc32.yaml |
| 2xB200_NVL | High Throughput | 1024 / 8192 | 32 | 1k8k_tp2_conc32.yaml |
| 4xB200_NVL | High Throughput | 1024 / 8192 | 32 | 1k8k_tp4_conc32.yaml |
| 8xB200_NVL | High Throughput | 1024 / 8192 | 32 | 1k8k_tp8_conc32.yaml |
| B200_NVL | High Throughput | 1024 / 8192 | 64 | 1k8k_tp1_conc64.yaml |
| 2xB200_NVL | High Throughput | 1024 / 8192 | 64 | 1k8k_tp2_conc64.yaml |
| 4xB200_NVL | High Throughput | 1024 / 8192 | 64 | 1k8k_tp4_conc64.yaml |
| 8xB200_NVL | Max Throughput | 1024 / 8192 | 64 | 1k8k_tp8_conc64.yaml |
| B200_NVL | Min Latency | 8192 / 1024 | 4 | 8k1k_tp1_conc4.yaml |
| 2xB200_NVL | Low Latency | 8192 / 1024 | 4 | 8k1k_tp2_conc4.yaml |
| 4xB200_NVL | Low Latency | 8192 / 1024 | 4 | 8k1k_tp4_conc4.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 4 | 8k1k_tp8_conc4.yaml |
| B200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp1_conc8.yaml |
| 2xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp2_conc8.yaml |
| 4xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp4_conc8.yaml |
| 8xB200_NVL | Low Latency | 8192 / 1024 | 8 | 8k1k_tp8_conc8.yaml |
| B200_NVL | Low Latency | 8192 / 1024 | 16 | 8k1k_tp1_conc16.yaml |
| 2xB200_NVL | Low Latency | 8192 / 1024 | 16 | 8k1k_tp2_conc16.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 16 | 8k1k_tp4_conc16.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 16 | 8k1k_tp8_conc16.yaml |
| B200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp1_conc32.yaml |
| 2xB200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp2_conc32.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp4_conc32.yaml |
| 8xB200_NVL | High Throughput | 8192 / 1024 | 32 | 8k1k_tp8_conc32.yaml |
| B200_NVL | High Throughput | 8192 / 1024 | 64 | 8k1k_tp1_conc64.yaml |
| 2xB200_NVL | High Throughput | 8192 / 1024 | 64 | 8k1k_tp2_conc64.yaml |
| 4xB200_NVL | High Throughput | 8192 / 1024 | 64 | 8k1k_tp4_conc64.yaml |
| 8xB200_NVL | Max Throughput | 8192 / 1024 | 64 | 8k1k_tp8_conc64.yaml |
| H200_SXM | Min Latency | 1024 / 1024 | 4 | 1k1k_tp1_conc4.yaml |
| 2xH200_SXM | Low Latency | 1024 / 1024 | 4 | 1k1k_tp2_conc4.yaml |
| 4xH200_SXM | Low Latency | 1024 / 1024 | 4 | 1k1k_tp4_conc4.yaml |
| 8xH200_SXM | Low Latency | 1024 / 1024 | 4 | 1k1k_tp8_conc4.yaml |
| H200_SXM | Low Latency | 1024 / 1024 | 8 | 1k1k_tp1_conc8.yaml |
| 2xH200_SXM | Low Latency | 1024 / 1024 | 8 | 1k1k_tp2_conc8.yaml |
| 4xH200_SXM | Low Latency | 1024 / 1024 | 8 | 1k1k_tp4_conc8.yaml |
| 8xH200_SXM | Low Latency | 1024 / 1024 | 8 | 1k1k_tp8_conc8.yaml |
| H200_SXM | Low Latency | 1024 / 1024 | 16 | 1k1k_tp1_conc16.yaml |
| 2xH200_SXM | Low Latency | 1024 / 1024 | 16 | 1k1k_tp2_conc16.yaml |
| 4xH200_SXM | High Throughput | 1024 / 1024 | 16 | 1k1k_tp4_conc16.yaml |
| 8xH200_SXM | High Throughput | 1024 / 1024 | 16 | 1k1k_tp8_conc16.yaml |
| H200_SXM | High Throughput | 1024 / 1024 | 32 | 1k1k_tp1_conc32.yaml |
| 2xH200_SXM | High Throughput | 1024 / 1024 | 32 | 1k1k_tp2_conc32.yaml |
| 4xH200_SXM | High Throughput | 1024 / 1024 | 32 | 1k1k_tp4_conc32.yaml |
| 8xH200_SXM | High Throughput | 1024 / 1024 | 32 | 1k1k_tp8_conc32.yaml |
| H200_SXM | High Throughput | 1024 / 1024 | 64 | 1k1k_tp1_conc64.yaml |
| 2xH200_SXM | High Throughput | 1024 / 1024 | 64 | 1k1k_tp2_conc64.yaml |
| 4xH200_SXM | High Throughput | 1024 / 1024 | 64 | 1k1k_tp4_conc64.yaml |
| 8xH200_SXM | Max Throughput | 1024 / 1024 | 64 | 1k1k_tp8_conc64.yaml |
| H200_SXM | Min Latency | 1024 / 8192 | 4 | 1k8k_tp1_conc4.yaml |
| 2xH200_SXM | Low Latency | 1024 / 8192 | 4 | 1k8k_tp2_conc4.yaml |
| 4xH200_SXM | Low Latency | 1024 / 8192 | 4 | 1k8k_tp4_conc4.yaml |
| 8xH200_SXM | Low Latency | 1024 / 8192 | 4 | 1k8k_tp8_conc4.yaml |
| H200_SXM | Low Latency | 1024 / 8192 | 8 | 1k8k_tp1_conc8.yaml |
| 2xH200_SXM | Low Latency | 1024 / 8192 | 8 | 1k8k_tp2_conc8.yaml |
| 4xH200_SXM | Low Latency | 1024 / 8192 | 8 | 1k8k_tp4_conc8.yaml |
| 8xH200_SXM | Low Latency | 1024 / 8192 | 8 | 1k8k_tp8_conc8.yaml |
| H200_SXM | Low Latency | 1024 / 8192 | 16 | 1k8k_tp1_conc16.yaml |
| 2xH200_SXM | Low Latency | 1024 / 8192 | 16 | 1k8k_tp2_conc16.yaml |
| 4xH200_SXM | High Throughput | 1024 / 8192 | 16 | 1k8k_tp4_conc16.yaml |
| 8xH200_SXM | High Throughput | 1024 / 8192 | 16 | 1k8k_tp8_conc16.yaml |
| H200_SXM | High Throughput | 1024 / 8192 | 32 | 1k8k_tp1_conc32.yaml |
| 2xH200_SXM | High Throughput | 1024 / 8192 | 32 | 1k8k_tp2_conc32.yaml |
| 4xH200_SXM | High Throughput | 1024 / 8192 | 32 | 1k8k_tp4_conc32.yaml |
| 8xH200_SXM | High Throughput | 1024 / 8192 | 32 | 1k8k_tp8_conc32.yaml |
| H200_SXM | High Throughput | 1024 / 8192 | 64 | 1k8k_tp1_conc64.yaml |
| 2xH200_SXM | High Throughput | 1024 / 8192 | 64 | 1k8k_tp2_conc64.yaml |
| 4xH200_SXM | High Throughput | 1024 / 8192 | 64 | 1k8k_tp4_conc64.yaml |
| 8xH200_SXM | Max Throughput | 1024 / 8192 | 64 | 1k8k_tp8_conc64.yaml |
| H200_SXM | Min Latency | 8192 / 1024 | 4 | 8k1k_tp1_conc4.yaml |
| 2xH200_SXM | Low Latency | 8192 / 1024 | 4 | 8k1k_tp2_conc4.yaml |
| 4xH200_SXM | Low Latency | 8192 / 1024 | 4 | 8k1k_tp4_conc4.yaml |
| 8xH200_SXM | Low Latency | 8192 / 1024 | 4 | 8k1k_tp8_conc4.yaml |
| H200_SXM | Low Latency | 8192 / 1024 | 8 | 8k1k_tp1_conc8.yaml |
| 2xH200_SXM | Low Latency | 8192 / 1024 | 8 | 8k1k_tp2_conc8.yaml |
| 4xH200_SXM | Low Latency | 8192 / 1024 | 8 | 8k1k_tp4_conc8.yaml |
| 8xH200_SXM | Low Latency | 8192 / 1024 | 8 | 8k1k_tp8_conc8.yaml |
| H200_SXM | Low Latency | 8192 / 1024 | 16 | 8k1k_tp1_conc16.yaml |
| 2xH200_SXM | Low Latency | 8192 / 1024 | 16 | 8k1k_tp2_conc16.yaml |
| 4xH200_SXM | High Throughput | 8192 / 1024 | 16 | 8k1k_tp4_conc16.yaml |
| 8xH200_SXM | High Throughput | 8192 / 1024 | 16 | 8k1k_tp8_conc16.yaml |
| H200_SXM | High Throughput | 8192 / 1024 | 32 | 8k1k_tp1_conc32.yaml |
| 2xH200_SXM | High Throughput | 8192 / 1024 | 32 | 8k1k_tp2_conc32.yaml |
| 4xH200_SXM | High Throughput | 8192 / 1024 | 32 | 8k1k_tp4_conc32.yaml |
| 8xH200_SXM | High Throughput | 8192 / 1024 | 32 | 8k1k_tp8_conc32.yaml |
| H200_SXM | High Throughput | 8192 / 1024 | 64 | 8k1k_tp1_conc64.yaml |
| 2xH200_SXM | High Throughput | 8192 / 1024 | 64 | 8k1k_tp2_conc64.yaml |
| 4xH200_SXM | High Throughput | 8192 / 1024 | 64 | 8k1k_tp4_conc64.yaml |
| 8xH200_SXM | Max Throughput | 8192 / 1024 | 64 | 8k1k_tp8_conc64.yaml |
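
For example, the single-GPU High Throughput recipe at 1024/1024 and concurrency 64 from the table above can be launched as follows (TRTLLM_DIR as in the DeepSeek-R1 example earlier):

# gpt-oss-120b, ISL/OSL 1024/1024, concurrency 64, single B200
TRTLLM_DIR=/app/tensorrt_llm   # adjust to your environment
trtllm-serve openai/gpt-oss-120b \
    --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml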

diff --git a/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.html
@@ -595,7 +597,7 @@
     -p 8000:8000 \
     -v ~/.cache:/root/.cache:rw \
     --name tensorrt_llm \
-    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
+    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \
     /bin/bash
@@ -612,7 +614,7 @@ nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5

Recommended Performance Settings#

We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use them out of the box, or adjust them to your specific use case.

 TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-throughput.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml

Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

@@ -644,7 +646,7 @@ cat << EOF > ${EXTRA_LLM_API_FIL

To use the DeepGEMM MOE backend on B200/GB200, use this config instead:

 TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
-EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/deepseek-r1-deepgemm.yaml
+EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml

Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -729,7 +731,9 @@ cat << EOF > ${EXTRA_LLM_API_FIL

    trust_remote_code#

    -

    Description: Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.

    +
      +
    • Description: Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.

    • +

    kv_cache_config#

    @@ -1008,6 +1012,410 @@ chmod +x bench.sh
    +
    +

    Preconfigured Recipes#

    +

    The following tables list recommended configurations from the comprehensive database for different performance profiles.

    +
    +

    Note

    +

    Traffic Patterns: The ISL (Input Sequence Length) and OSL (Output Sequence Length) +values in each configuration represent the maximum supported values for that config. +Requests exceeding these limits may result in errors.

    +

    To handle requests with input sequences longer than the configured ISL, add the following +to your config file:

    +
    enable_chunked_prefill: true
    +
    +
    +

    This enables chunked prefill, which processes long input sequences in chunks rather than +requiring them to fit within a single prefill operation. Note that enabling chunked prefill +does not guarantee optimal performance—these configs are tuned for the specified ISL/OSL.

    +
    +
    +

    DeepSeek-R1#

    +
    ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    8xB200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml

    8xB200_NVL

    Balanced

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml

    8xB200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml

    8xB200_NVL

    Balanced

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml

    8xH200_SXM

    Min Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml

    8xH200_SXM

    Balanced

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml

    8xH200_SXM

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml

    8xH200_SXM

    Min Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml

    8xH200_SXM

    Balanced

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml

    8xH200_SXM

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml

    +
    +
    +
    +

    DeepSeek-R1 (NVFP4)#

    +
    ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    4xB200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    128

    1k1k_tp4_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    128

    1k1k_tp8_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    256

    1k1k_tp4_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    256

    1k1k_tp8_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml

    4xB200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    128

    8k1k_tp4_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    128

    8k1k_tp8_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    256

    8k1k_tp4_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    256

    8k1k_tp8_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml

    +
    +
    +
    @@ -1101,6 +1509,11 @@ chmod +x bench.sh +
  • Preconfigured Recipes +
  • @@ -1192,9 +1605,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html index a3dcb09b28..be309fdff3 100644 --- a/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -370,6 +371,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -583,7 +585,7 @@ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash @@ -601,7 +603,7 @@ nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use these out-of-the-box, or adjust them to your specific use case.

    For low-latency use cases:

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-latency.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -631,7 +633,7 @@ cat << EOF > ${EXTRA_LLM_API_FIL

    For max-throughput use cases:

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/gpt-oss-120b-throughput.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -970,6 +972,889 @@ chmod +x bench.sh +
    +

    Preconfigured Recipes#

    +

    The following table lists recommended configurations from the comprehensive database for different performance profiles.

    +
    +

    Note

    +

    Traffic Patterns: The ISL (Input Sequence Length) and OSL (Output Sequence Length) +values in each configuration represent the maximum supported values for that config. +Requests exceeding these limits may result in errors.

    +

    To handle requests with input sequences longer than the configured ISL, add the following +to your config file:

    +
    enable_chunked_prefill: true
    +
    +
    +

    This enables chunked prefill, which processes long input sequences in chunks rather than +requiring them to fit within a single prefill operation. Note that enabling chunked prefill +does not guarantee optimal performance—these configs are tuned for the specified ISL/OSL.

    +
    +
    +

    gpt-oss-120b#

    +
    ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    B200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml

    B200_NVL

    Min Latency

    1024 / 8192

    4

    1k8k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    1024 / 8192

    16

    1k8k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    16

    1k8k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    16

    1k8k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 8192

    16

    1k8k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    1024 / 8192

    64

    1k8k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml

    B200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    1024 / 1024

    4

    1k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    1024 / 1024

    16

    1k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    16

    1k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    1024 / 8192

    4

    1k8k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    1024 / 8192

    16

    1k8k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    16

    1k8k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    16

    1k8k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 8192

    16

    1k8k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    1024 / 8192

    64

    1k8k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    8192 / 1024

    4

    8k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    8192 / 1024

    16

    8k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    16

    8k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml

    +
    +
    +
    @@ -1060,6 +1945,10 @@ chmod +x bench.sh +
  • Preconfigured Recipes +
  • @@ -1151,9 +2040,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html index 4fb0ef600e..186a0c0598 100644 --- a/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -812,6 +814,20 @@ chmod +x bench.sh +
    +

    Troubleshooting#

    +

    Since Kimi K2 Thinking has larger weight size than other models, it’s possible seeing host OOM issues, as the following:

    +
    Loading weights: 100%|█████████████████████| 1408/1408 [03:43<00:00,  6.30it/s]
    + 0: [12/04/2025-18:38:28] [TRT-LLM] [RANK 0] [I] moe_load_balancer finalizing model...
    + 1: [nvl72136-T14:452151:0:452151] Caught signal 7 (Bus error: nonexistent physical address)
    + 1: ==== backtrace (tid: 452151) ====
    + 1:  0  /usr/local/ucx//lib/libucs.so.0(ucs_handle_error+0x2cc) [0xffff9638274c]
    + 1:  1  /usr/local/ucx//lib/libucs.so.0(+0x328fc) [0xffff963828fc]
    + 1:  2  /usr/local/ucx//lib/libucs.so.0(+0x32c78) [0xffff96382c78]
    +
    +
    +

    This can be addressed by mounting tmpfs:/dev/shm:size=640G when launching the Docker container, to increase the shm size that the container can access.

    +
    @@ -874,6 +890,7 @@ chmod +x bench.sh
  • Deploy Kimi K2 Thinking on GB200 NVL72 through SLURM with wide EP and disaggregated serving
  • Query the OpenAI-compatible API Endpoint
  • Benchmark
  • +
  • Troubleshooting
  • @@ -965,9 +982,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html index 82efa16aeb..0eb5921175 100644 --- a/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -370,6 +371,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -562,7 +564,7 @@ Python3 and python3-pip (Optional, for accuracy evaluation only)

    -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash @@ -579,7 +581,7 @@ nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 Recommended Performance Settings#

    We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use these out-of-the-box, or adjust them to your specific use case.

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-3.3-70b.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -1070,9 +1072,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html index 6f92d23017..b2cb805afe 100644 --- a/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -370,6 +371,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -564,7 +566,7 @@ -p 8000:8000 \ -v ~/.cache:/root/.cache:rw \ --name tensorrt_llm \ -nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \ +nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6 \ /bin/bash @@ -581,7 +583,7 @@ nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 Recommended Performance Settings#

    We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use these out-of-the-box, or adjust them to your specific use case.

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/llama-4-scout.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -1098,9 +1100,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html index fb316de6ec..72ce5438bc 100644 --- a/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -561,7 +563,7 @@

    Recommended Performance Settings#

    We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use these out-of-the-box, or adjust them to your specific use case.

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3-next.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -934,9 +936,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html b/latest/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html index e828ebd17d..8f45049d4e 100644 --- a/latest/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html +++ b/latest/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • Curl Chat Client
  • Curl Chat Client For Multimodal
  • Curl Completion Client
  • +
  • Curl Responses Client
  • Deepseek R1 Reasoning Parser
  • Genai Perf Client
  • Genai Perf Client For Multimodal
  • @@ -368,6 +369,7 @@
  • OpenAI Completion Client
  • Openai Completion Client For Lora
  • OpenAI Completion Client with JSON Schema
  • +
  • OpenAI Responses Client
  • Dynamo K8s Example
  • @@ -565,7 +567,7 @@ make -C docker Recommended Performance Settings#

    We maintain YAML configuration files with recommended performance settings in the examples/configs directory. These config files are present in the TensorRT LLM container at the path /app/tensorrt_llm/examples/configs. You can use these out-of-the-box, or adjust them to your specific use case.

    TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
    -EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml
    +EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml
     

    Note: if you don’t have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

    @@ -968,9 +970,9 @@ chmod +x bench.sh diff --git a/latest/deployment-guide/index.html b/latest/deployment-guide/index.html index ebddeae0cd..f05d2fd045 100644 --- a/latest/deployment-guide/index.html +++ b/latest/deployment-guide/index.html @@ -63,7 +63,7 @@ @@ -78,7 +78,7 @@ - + @@ -362,6 +362,7 @@
    Comprehensive Configuration Database#

    The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings.

    Note

    Traffic Patterns: The ISL (Input Sequence Length) and OSL (Output Sequence Length) values in each configuration represent the maximum supported values for that config. Requests exceeding these limits may result in errors.

    To handle requests with input sequences longer than the configured ISL, add the following to your config file:

    enable_chunked_prefill: true

    This enables chunked prefill, which processes long input sequences in chunks rather than requiring them to fit within a single prefill operation. Note that enabling chunked prefill does not guarantee optimal performance; these configs are tuned for the specified ISL/OSL.
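    As a concrete illustration of the note above, the sketch below copies one of the tuned database configs from the DeepSeek-R1 table that follows, appends chunked prefill so inputs longer than the tuned ISL are accepted, and serves with the modified file; paths assume the container layout used elsewhere on this page.

    # Illustrative only: reuse a tuned database config and accept inputs longer than its ISL
    TRTLLM_DIR=/app/tensorrt_llm
    cp ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml my_config.yaml
    # Setting from the note above; the config is still tuned for the original 1024/1024 traffic
    echo "enable_chunked_prefill: true" >> my_config.yaml
    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options my_config.yaml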

    DeepSeek-R1#


    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    8xB200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml

    8xB200_NVL

    Balanced

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml

    8xB200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml

    8xB200_NVL

    Balanced

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml

    8xH200_SXM

    Min Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml

    8xH200_SXM

    Balanced

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml

    8xH200_SXM

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml

    8xH200_SXM

    Min Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml

    8xH200_SXM

    Balanced

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml

    8xH200_SXM

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml


    DeepSeek-R1 (NVFP4)#


    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    4xB200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    128

    1k1k_tp4_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    128

    1k1k_tp8_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    256

    1k1k_tp4_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    256

    1k1k_tp8_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml

    4xB200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    128

    8k1k_tp4_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    128

    8k1k_tp8_conc128.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    256

    8k1k_tp4_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    256

    8k1k_tp8_conc256.yaml

    trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml


    gpt-oss-120b#


    GPU

    Performance Profile

    ISL / OSL

    Concurrency

    Config

    Command

    B200_NVL

    Min Latency

    1024 / 1024

    4

    1k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    1024 / 1024

    16

    1k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml

    B200_NVL

    Min Latency

    1024 / 8192

    4

    1k8k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    1024 / 8192

    4

    1k8k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    1024 / 8192

    8

    1k8k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    1024 / 8192

    16

    1k8k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    1024 / 8192

    16

    1k8k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    16

    1k8k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    1024 / 8192

    16

    1k8k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    1024 / 8192

    32

    1k8k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    1024 / 8192

    64

    1k8k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    1024 / 8192

    64

    1k8k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml

    B200_NVL

    Min Latency

    8192 / 1024

    4

    8k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml

    B200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml

    4xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml

    8xB200_NVL

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml

    B200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml

    2xB200_NVL

    Low Latency

    8192 / 1024

    16

    8k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml

    B200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml

    2xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml

    8xB200_NVL

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml

    B200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml

    2xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml

    4xB200_NVL

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml

    8xB200_NVL

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    1024 / 1024

    4

    1k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    4

    1k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    1024 / 1024

    8

    1k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    1024 / 1024

    16

    1k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    1024 / 1024

    16

    1k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    16

    1k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    16

    1k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    1024 / 1024

    32

    1k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    1024 / 1024

    64

    1k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    1024 / 1024

    64

    1k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    1024 / 8192

    4

    1k8k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    1024 / 8192

    4

    1k8k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    1024 / 8192

    8

    1k8k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    1024 / 8192

    16

    1k8k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    1024 / 8192

    16

    1k8k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    16

    1k8k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    1024 / 8192

    16

    1k8k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    1024 / 8192

    32

    1k8k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    1024 / 8192

    64

    1k8k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    1024 / 8192

    64

    1k8k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml

    H200_SXM

    Min Latency

    8192 / 1024

    4

    8k1k_tp1_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp2_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml

    4xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp4_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    4

    8k1k_tp8_conc4.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml

    H200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp1_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp2_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml

    4xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp4_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml

    8xH200_SXM

    Low Latency

    8192 / 1024

    8

    8k1k_tp8_conc8.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml

    H200_SXM

    Low Latency

    8192 / 1024

    16

    8k1k_tp1_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml

    2xH200_SXM

    Low Latency

    8192 / 1024

    16

    8k1k_tp2_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    16

    8k1k_tp4_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    16

    8k1k_tp8_conc16.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml

    H200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp1_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml

    2xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp2_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp4_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml

    8xH200_SXM

    High Throughput

    8192 / 1024

    32

    8k1k_tp8_conc32.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml

    H200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp1_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml

    2xH200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp2_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml

    4xH200_SXM

    High Throughput

    8192 / 1024

    64

    8k1k_tp4_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml

    8xH200_SXM

    Max Throughput

    8192 / 1024

    64

    8k1k_tp8_conc64.yaml

    trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml

    @@ -684,6 +1960,12 @@ @@ -775,9 +2057,9 @@ diff --git a/latest/deployment-guide/note_sections.html b/latest/deployment-guide/note_sections.html new file mode 100644 index 0000000000..f6ef69afbf --- /dev/null +++ b/latest/deployment-guide/note_sections.html @@ -0,0 +1,678 @@

    Note

    Traffic Patterns: The ISL (Input Sequence Length) and OSL (Output Sequence Length) values in each configuration represent the maximum supported values for that config. Requests exceeding these limits may result in errors.

    To handle requests with input sequences longer than the configured ISL, add the following to your config file:

    enable_chunked_prefill: true

    This enables chunked prefill, which processes long input sequences in chunks rather than requiring them to fit within a single prefill operation. Note that enabling chunked prefill does not guarantee optimal performance; these configs are tuned for the specified ISL/OSL.

    Note

    The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the Comprehensive Configuration Database section below, which covers a larger set of traffic patterns and performance profiles.
    \ No newline at end of file diff --git a/latest/developer-guide/api-change.html b/latest/developer-guide/api-change.html index 000ef90c1c..80b5fb9083 100644 --- a/latest/developer-guide/api-change.html +++ b/latest/developer-guide/api-change.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -1039,9 +1041,9 @@ python -m pytest < diff --git a/latest/developer-guide/ci-overview.html b/latest/developer-guide/ci-overview.html index 95b5649479..42ab10d7b2 100644 --- a/latest/developer-guide/ci-overview.html +++ b/latest/developer-guide/ci-overview.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -795,9 +797,9 @@ selective keeps CI turnaround fast and conserves hardware resources.

    diff --git a/latest/developer-guide/dev-containers.html b/latest/developer-guide/dev-containers.html index f57aec97fe..c412f99ef9 100644 --- a/latest/developer-guide/dev-containers.html +++ b/latest/developer-guide/dev-containers.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -757,9 +759,9 @@ initialization script will create one with the contents listed above.

    diff --git a/latest/developer-guide/kv-transfer.html b/latest/developer-guide/kv-transfer.html index 8cfab70ad3..055d43edb9 100644 --- a/latest/developer-guide/kv-transfer.html +++ b/latest/developer-guide/kv-transfer.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -755,9 +757,9 @@ diff --git a/latest/developer-guide/overview.html b/latest/developer-guide/overview.html index cec2b84e3f..7cf715168f 100644 --- a/latest/developer-guide/overview.html +++ b/latest/developer-guide/overview.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -723,9 +725,9 @@ diff --git a/latest/developer-guide/perf-analysis.html b/latest/developer-guide/perf-analysis.html index e55977f10c..d86a0e262c 100644 --- a/latest/developer-guide/perf-analysis.html +++ b/latest/developer-guide/perf-analysis.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -586,10 +588,12 @@
    #!/bin/bash
     
     # Prepare dataset for the benchmark
    -python3 benchmarks/cpp/prepare_dataset.py \
    -    --tokenizer=${MODEL_PATH} \
    -    --stdout token-norm-dist --num-requests=${NUM_SAMPLES} \
    -    --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
    +trtllm-bench --model ${MODEL_PATH} \
    +    prepare-dataset \
    +    --output dataset.txt \
    +    token-norm-dist \
    +    --num-requests=${NUM_SAMPLES} \
    +    --input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0
     
     # Benchmark and profile
     TLLM_PROFILE_START_STOP=100-150 nsys profile \
    @@ -770,9 +774,9 @@ python3 benchmarks/cpp/prepare_dataset.py
             
           
    diff --git a/latest/developer-guide/perf-benchmarking.html b/latest/developer-guide/perf-benchmarking.html
    index 34534669f2..09142d9b69 100644
    --- a/latest/developer-guide/perf-benchmarking.html
    +++ b/latest/developer-guide/perf-benchmarking.html
    @@ -61,7 +61,7 @@
         
    @@ -76,7 +76,7 @@
     
       
       
    -  
    +  
     
     
       
    @@ -360,6 +360,7 @@
     
  • @@ -691,7 +693,7 @@ JSON entry is on every line.

    In order to prepare a synthetic dataset, you can use the provided script in the benchmarks/cpp directory. For example, to generate a synthetic dataset of 1000 requests with a uniform ISL/OSL of 128/128 for meta-llama/Llama-3.1-8B, run:

    -
    python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt
    +
    trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synthetic_128_128.txt token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000
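    Once written, the dataset can be passed straight to the throughput subcommand; the line below is a sketch that mirrors the benchmark invocations later in this guide, with single-GPU defaults assumed.

    trtllm-bench --model meta-llama/Llama-3.1-8B throughput --dataset /tmp/synthetic_128_128.txt --backend pytorch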
     
    @@ -765,11 +767,11 @@ Total Latency (Benchmarking with LoRA Adapters in PyTorch workflow#

    The PyTorch workflow supports benchmarking with LoRA (Low-Rank Adaptation) adapters. This requires preparing a dataset with LoRA metadata and configuring the LoRA settings.

    Preparing LoRA Dataset

    -

    Use prepare_dataset.py with LoRA-specific options to generate requests with LoRA metadata:

    -
    python3 benchmarks/cpp/prepare_dataset.py \
    -  --stdout \
    +

    Use trtllm-bench prepare-dataset with LoRA-specific options to generate requests with LoRA metadata:

    +
    trtllm-bench \
    +  --model /path/to/tokenizer \
    +  prepare-dataset \
       --rand-task-id 0 1 \
    -  --tokenizer /path/to/tokenizer \
       --lora-dir /path/to/loras \
       token-norm-dist \
       --num-requests 100 \
    @@ -834,16 +836,17 @@ Each subdirectory should contain the LoRA adapter files for that specific task.<
     

    Running multi-modal models in the PyTorch Workflow#

    To benchmark multi-modal models with the PyTorch workflow, you can follow a similar approach to the one above.

    First, prepare the dataset:

    -
    python ./benchmarks/cpp/prepare_dataset.py \
    -  --tokenizer Qwen/Qwen2-VL-2B-Instruct \
    -  --stdout \
    -  dataset \
    -  --dataset-name lmms-lab/MMMU \
    -  --dataset-split test \
    -  --dataset-image-key image \
    -  --dataset-prompt-key question \
    -  --num-requests 10 \
    -  --output-len-dist 128,5 > mm_data.jsonl
    +
    trtllm-bench \
    +  --model Qwen/Qwen2-VL-2B-Instruct \
    +  prepare-dataset \
     +  --output mm_data.jsonl \
     +  real-dataset \
    +  --dataset-name lmms-lab/MMMU \
    +  --dataset-split test \
    +  --dataset-image-key image \
    +  --dataset-prompt-key question \
    +  --num-requests 10 \
    +  --output-len-dist 128,5
     

    It will download the media files to the /tmp directory and prepare the dataset with their paths. Note that the prompt fields are text and not tokenized ids. This is due to the fact that
    @@ -944,9 +947,9 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp

  • nvidia/Llama-3.1-70B-Instruct-FP8

  • nvidia/Llama-3.1-405B-Instruct-FP8

  • -

    To understand more about how to quantize your own checkpoints, refer to ModelOpt documentation.

    +

    To understand more about how to quantize your own checkpoints, refer to ModelOpt documentation.

    trtllm-bench utilizes the hf_quant_config.json file present in the pre-quantized checkpoints above. The configuration
    -file is present in checkpoints quantized with TensorRT Model Optimizer
    +file is present in checkpoints quantized with Model Optimizer
    and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above:

    {
    @@ -1175,9 +1178,9 @@ when the checkpoint precision is 
     
    diff --git a/latest/developer-guide/perf-overview.html b/latest/developer-guide/perf-overview.html index db43ccba32..ec5fb499c7 100644 --- a/latest/developer-guide/perf-overview.html +++ b/latest/developer-guide/perf-overview.html @@ -61,7 +61,7 @@ @@ -74,7 +74,7 @@ - + @@ -358,6 +358,7 @@
  • @@ -531,7 +533,7 @@ Tuning batch sizes, parallelism configurations, and other options may lead to im

The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages), representing the throughput scenario under maximum load. The reported metric is Total Output Throughput (tokens/sec).

    The performance numbers below were collected using the steps described in this document.

Testing was performed on models with weights quantized using ModelOpt and published by NVIDIA on the Model Optimizer HuggingFace Collection.

    (NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:

    RTX 6000 Pro Blackwell Server Edition data is now included in the perf overview. RTX 6000 systems can benefit from enabling pipeline parallelism (PP) in LLM workloads, so we included several new benchmarks for this GPU at various TP x PP combinations. That data is presented in a separate table for each network.


    Preparing a Dataset#

In order to prepare a dataset, you can use the provided script. To generate a synthetic dataset, run the following command:

    python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file
     
…remain in the system longer and therefore require fewer requests to achieve a steady state.

    Running the Benchmark#

To run the benchmark with the generated dataset, use the trtllm-bench throughput subcommand. The benchmarker runs an offline maximum-throughput scenario in which all requests are queued in rapid succession. You need to provide a model name (a HuggingFace reference or a path to a local model), a generated dataset, and a file containing any desired extra options for the LLM API (details in tensorrt_llm/llmapi/llm_args.py:LlmArgs).

    For dense / non-MoE models:

    trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
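As a reference for what $llm_options can contain: it is a YAML file whose top-level keys are LlmArgs fields. A minimal sketch follows; treat the specific keys as illustrative assumptions and verify them against tensorrt_llm/llmapi/llm_args.py:

# llm_options.yaml -- minimal illustrative sketch; any LlmArgs field can be set this way
cuda_graph_config:
  enable_padding: true
print_iter_log: true
enable_attention_dp: false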
     
diff --git a/latest/examples/curl_chat_client.html b/latest/examples/curl_chat_client.html

    Curl Chat Client#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

curl http://localhost:8000/v1/chat/completions \
diff --git a/latest/examples/curl_chat_client_for_multimodal.html b/latest/examples/curl_chat_client_for_multimodal.html
     

    Curl Chat Client For Multimodal#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

# SINGLE IMAGE INFERENCE
diff --git a/latest/examples/curl_completion_client.html b/latest/examples/curl_completion_client.html
     

    Curl Completion Client#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

curl http://localhost:8000/v1/completions \
diff --git a/latest/examples/curl_responses_client.html b/latest/examples/curl_responses_client.html
new file mode 100644

Curl Responses Client — TensorRT LLM

    Curl Responses Client#

Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

curl http://localhost:8000/v1/responses \
    -H "Content-Type: application/json" \
    -d '{
        "model": "TinyLlama-1.1B-Chat-v1.0",
        "input": "Where is New York?",
        "max_output_tokens": 16
    }'
diff --git a/latest/examples/customization.html b/latest/examples/customization.html
diff --git a/latest/examples/deepseek_r1_reasoning_parser.html b/latest/examples/deepseek_r1_reasoning_parser.html

    Deepseek R1 Reasoning Parser#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

cat >./extra-llm-api-config.yml <<EOF
diff --git a/latest/examples/dynamo_k8s_example.html b/latest/examples/dynamo_k8s_example.html
     


diff --git a/latest/examples/genai_perf_client.html b/latest/examples/genai_perf_client.html

    Genai Perf Client#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

genai-perf profile \
diff --git a/latest/examples/genai_perf_client_for_multimodal.html b/latest/examples/genai_perf_client_for_multimodal.html
     

    Genai Perf Client For Multimodal#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

#! /usr/bin/env bash

genai-perf profile \
diff --git a/latest/examples/index.html b/latest/examples/index.html
     
def main():

    # Model could accept HF model name, a path to local HF model,
    # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Sample prompts.

diff --git a/latest/examples/kvcacheconfig.html b/latest/examples/kvcacheconfig.html
diff --git a/latest/examples/kvcacheretentionconfig.html b/latest/examples/kvcacheretentionconfig.html
diff --git a/latest/examples/llm_api_examples.html b/latest/examples/llm_api_examples.html
diff --git a/latest/examples/llm_guided_decoding.html b/latest/examples/llm_guided_decoding.html

    Generate text with guided decoding#

Source NVIDIA/TensorRT-LLM.

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import GuidedDecodingParams
diff --git a/latest/examples/llm_inference.html b/latest/examples/llm_inference.html
     

    Generate text#

Source NVIDIA/TensorRT-LLM.

from tensorrt_llm import LLM, SamplingParams


def main():

    # Model could accept HF model name, a path to local HF model,
    # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Sample prompts.
diff --git a/latest/examples/llm_inference_async.html b/latest/examples/llm_inference_async.html
     

    Generate text asynchronously#

Source NVIDIA/TensorRT-LLM.

import asyncio

from tensorrt_llm import LLM, SamplingParams
diff --git a/latest/examples/llm_inference_async_streaming.html b/latest/examples/llm_inference_async_streaming.html
     

    Generate text in streaming#

Source NVIDIA/TensorRT-LLM.

import asyncio

from tensorrt_llm import LLM, SamplingParams
diff --git a/latest/examples/llm_inference_distributed.html b/latest/examples/llm_inference_distributed.html
     

    Distributed LLM Generation#

Source NVIDIA/TensorRT-LLM.

from tensorrt_llm import LLM, SamplingParams
diff --git a/latest/examples/llm_kv_cache_connector.html b/latest/examples/llm_kv_cache_connector.html
     

    KV Cache Connector#

Source NVIDIA/TensorRT-LLM.

'''
This script demonstrates the KV cache connector feature in TensorRT-LLM, which enables
custom persistence and reuse of KV cache blocks across different LLM instances.
diff --git a/latest/examples/llm_kv_cache_offloading.html b/latest/examples/llm_kv_cache_offloading.html
     

    KV Cache Offloading#

Source NVIDIA/TensorRT-LLM.

'''
This script demonstrates the effectiveness of KV cache host offloading in TensorRT-LLM.
diff --git a/latest/examples/llm_logits_processor.html b/latest/examples/llm_logits_processor.html
     

    Control generated text using logits processor#

Source NVIDIA/TensorRT-LLM.

from typing import List, Optional

import torch
diff --git a/latest/examples/llm_mgmn_llm_distributed.html b/latest/examples/llm_mgmn_llm_distributed.html
     

    Run LLM-API with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

#!/bin/bash
#SBATCH -A <account>    # parameter
#SBATCH -p <partition>  # parameter
diff --git a/latest/examples/llm_mgmn_trtllm_bench.html b/latest/examples/llm_mgmn_trtllm_bench.html
     

    Run trtllm-bench with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

#!/bin/bash
#SBATCH -A <account>
#SBATCH -p <partition>
    @@ -591,64 +593,63 @@
      68#      not supported in Slurm mode, you need to download the model and put it in
      69#      the LOCAL_MODEL directory.
      70
    - 71export prepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py"
    - 72export data_path="$WORKDIR/token-norm-dist.txt"
    - 73
    - 74echo "Preparing dataset..."
    - 75srun -l \
    - 76    -N 1 \
    - 77    -n 1 \
    - 78    --container-image=${CONTAINER_IMAGE} \
    - 79    --container-name="prepare-name" \
    - 80    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
    - 81    --container-workdir=${WORKDIR} \
    - 82    --export=ALL \
    - 83    --mpi=pmix \
    - 84    bash -c "
    - 85        $PROLOGUE
    - 86        python3 $prepare_dataset \
    - 87            --tokenizer=$LOCAL_MODEL \
    - 88            --stdout token-norm-dist \
    - 89            --num-requests=100 \
    - 90            --input-mean=128 \
    - 91            --output-mean=128 \
    - 92            --input-stdev=0 \
    - 93            --output-stdev=0 > $data_path
    - 94    "
    - 95
    - 96echo "Running benchmark..."
    - 97# Just launch trtllm-bench job with trtllm-llmapi-launch command.
    - 98
    - 99srun -l \
    -100    --container-image=${CONTAINER_IMAGE} \
    -101    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
    -102    --container-workdir=${WORKDIR} \
    -103    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
    -104    --mpi=pmix \
    -105    bash -c "
    -106        set -ex
    -107        $PROLOGUE
    -108        export PATH=$PATH:~/.local/bin
    -109
    -110        # This is optional
    -111        cat > /tmp/pytorch_extra_args.txt << EOF
    -112cuda_graph_config: null
    -113print_iter_log: true
    -114enable_attention_dp: false
    -115EOF
    -116
    -117        # launch the benchmark
    -118        trtllm-llmapi-launch \
    -119         trtllm-bench \
    -120            --model $MODEL_NAME \
    -121            --model_path $LOCAL_MODEL \
    -122            throughput \
    -123            --dataset $data_path \
    -124            --backend pytorch \
    -125            --tp 16 \
    -126            --extra_llm_api_options /tmp/pytorch_extra_args.txt \
    -127            $EXTRA_ARGS
    -128    "
    + 71export data_path="$WORKDIR/token-norm-dist.txt"
    + 72
    + 73echo "Preparing dataset..."
    + 74srun -l \
    + 75    -N 1 \
    + 76    -n 1 \
    + 77    --container-image=${CONTAINER_IMAGE} \
    + 78    --container-name="prepare-name" \
    + 79    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
    + 80    --container-workdir=${WORKDIR} \
    + 81    --export=ALL \
    + 82    --mpi=pmix \
    + 83    bash -c "
    + 84        $PROLOGUE
    + 85        trtllm-bench --model=$LOCAL_MODEL prepare-dataset \
    + 86            --output $data_path \
    + 87            token-norm-dist \
    + 88            --num-requests=100 \
    + 89            --input-mean=128 \
    + 90            --output-mean=128 \
    + 91            --input-stdev=0 \
    + 92            --output-stdev=0
    + 93    "
    + 94
    + 95echo "Running benchmark..."
    + 96# Just launch trtllm-bench job with trtllm-llmapi-launch command.
    + 97
    + 98srun -l \
    + 99    --container-image=${CONTAINER_IMAGE} \
    +100    --container-mounts=${MOUNT_DIR}:${MOUNT_DEST} \
    +101    --container-workdir=${WORKDIR} \
    +102    --export=ALL,PYTHONPATH=${SOURCE_ROOT} \
    +103    --mpi=pmix \
    +104    bash -c "
    +105        set -ex
    +106        $PROLOGUE
    +107        export PATH=$PATH:~/.local/bin
    +108
    +109        # This is optional
    +110        cat > /tmp/pytorch_extra_args.txt << EOF
    +111cuda_graph_config: null
    +112print_iter_log: true
    +113enable_attention_dp: false
    +114EOF
    +115
    +116        # launch the benchmark
    +117        trtllm-llmapi-launch \
    +118         trtllm-bench \
    +119            --model $MODEL_NAME \
    +120            --model_path $LOCAL_MODEL \
    +121            throughput \
    +122            --dataset $data_path \
    +123            --backend pytorch \
    +124            --tp 16 \
    +125            --extra_llm_api_options /tmp/pytorch_extra_args.txt \
    +126            $EXTRA_ARGS
    +127    "
     
diff --git a/latest/examples/llm_mgmn_trtllm_serve.html b/latest/examples/llm_mgmn_trtllm_serve.html

    Run trtllm-serve with pytorch backend on Slurm#

Source NVIDIA/TensorRT-LLM.

#!/bin/bash
#SBATCH -A <account>
#SBATCH -p <partition>
diff --git a/latest/examples/llm_multilora.html b/latest/examples/llm_multilora.html
     

    Generate text with multiple LoRA adapters#

Source NVIDIA/TensorRT-LLM.

import argparse
from typing import Optional
diff --git a/latest/examples/llm_runtime.html b/latest/examples/llm_runtime.html
     

    Runtime Configuration Examples#

Source NVIDIA/TensorRT-LLM.

'''
This script demonstrates various runtime configuration options in TensorRT-LLM,
including KV cache management and CUDA graph optimizations.
diff --git a/latest/examples/llm_sampling.html b/latest/examples/llm_sampling.html
     

    Sampling Techniques Showcase#

Source NVIDIA/TensorRT-LLM.

"""
This example demonstrates various sampling techniques available in TensorRT-LLM.
It showcases different sampling parameters and their effects on text generation.
diff --git a/latest/examples/llm_sparse_attention.html b/latest/examples/llm_sparse_attention.html
     

    Sparse Attention#

Source NVIDIA/TensorRT-LLM.

"""
This example demonstrates how to use sparse attention with TensorRT-LLM.
diff --git a/latest/examples/llm_speculative_decoding.html b/latest/examples/llm_speculative_decoding.html
     

    Speculative Decoding#

Source NVIDIA/TensorRT-LLM.

from typing import Optional

import click
diff --git a/latest/examples/openai_chat_client.html b/latest/examples/openai_chat_client.html
     

    OpenAI Chat Client#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

from openai import OpenAI
diff --git a/latest/examples/openai_chat_client_for_multimodal.html b/latest/examples/openai_chat_client_for_multimodal.html
     

    OpenAI Chat Client for Multimodal#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

import os
from pathlib import Path
diff --git a/latest/examples/openai_completion_client.html b/latest/examples/openai_completion_client.html
     

    OpenAI Completion Client#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

from openai import OpenAI
diff --git a/latest/examples/openai_completion_client_for_lora.html b/latest/examples/openai_completion_client_for_lora.html
     

    Openai Completion Client For Lora#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

### OpenAI Completion Client

import os
diff --git a/latest/examples/openai_completion_client_json_schema.html b/latest/examples/openai_completion_client_json_schema.html
     

    OpenAI Completion Client with JSON Schema#

    Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

# This example requires to specify `guided_decoding_backend` as
# `xgrammar` or `llguidance` in the extra_llm_api_options.yaml file.
diff --git a/latest/examples/openai_responses_client.html b/latest/examples/openai_responses_client.html
new file mode 100644

OpenAI Responses Client — TensorRT LLM

    OpenAI Responses Client#

Refer to the trtllm-serve documentation for starting a server.

Source NVIDIA/TensorRT-LLM.

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="tensorrt_llm",
)

response = client.responses.create(
    model="TinyLlama-1.1B-Chat-v1.0",
    input="Where is New York?",
    max_output_tokens=20,
)
print(response)
diff --git a/latest/examples/trtllm_serve_examples.html b/latest/examples/trtllm_serve_examples.html
diff --git a/latest/features/additional-outputs.html b/latest/features/additional-outputs.html
diff --git a/latest/features/attention.html b/latest/features/attention.html

the different requests by a cache manager during processing. That cache manager keeps track of the sequences, allocates new blocks from a pool, and recycles those blocks when required. See the implementation of KVCacheManager.

    INT8/FP8 KV Caches#

diff --git a/latest/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.html b/latest/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.html
diff --git a/latest/features/auto_deploy/advanced/example_run.html b/latest/features/auto_deploy/advanced/example_run.html
diff --git a/latest/features/auto_deploy/advanced/expert_configurations.html b/latest/features/auto_deploy/advanced/expert_configurations.html
diff --git a/latest/features/auto_deploy/advanced/logging.html b/latest/features/auto_deploy/advanced/logging.html
diff --git a/latest/features/auto_deploy/advanced/workflow.html b/latest/features/auto_deploy/advanced/workflow.html
diff --git a/latest/features/auto_deploy/auto-deploy.html b/latest/features/auto_deploy/auto-deploy.html
diff --git a/latest/features/auto_deploy/support_matrix.html b/latest/features/auto_deploy/support_matrix.html

    Precision Support#

AutoDeploy supports models with various precision formats, including quantized checkpoints generated by Model-Optimizer.

    Supported precision types include:

diff --git a/latest/features/disagg-serving.html b/latest/features/disagg-serving.html
diff --git a/latest/features/feature-combination-matrix.html b/latest/features/feature-combination-matrix.html
diff --git a/latest/features/guided-decoding.html b/latest/features/guided-decoding.html
diff --git a/latest/features/helix.html b/latest/features/helix.html
diff --git a/latest/features/kv-cache-connector.html b/latest/features/kv-cache-connector.html
  • @@ -818,9 +820,9 @@ diff --git a/latest/features/kvcache.html b/latest/features/kvcache.html index b5991d90de..70df8d464d 100644 --- a/latest/features/kvcache.html +++ b/latest/features/kvcache.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -755,9 +757,9 @@ diff --git a/latest/features/long-sequence.html b/latest/features/long-sequence.html index 56cdd11053..3d9801e624 100644 --- a/latest/features/long-sequence.html +++ b/latest/features/long-sequence.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -724,9 +726,9 @@ diff --git a/latest/features/lora.html b/latest/features/lora.html index dcf87e1b65..cf4ee8df68 100644 --- a/latest/features/lora.html +++ b/latest/features/lora.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -928,9 +930,9 @@ diff --git a/latest/features/multi-modality.html b/latest/features/multi-modality.html index c1d5236efa..5d65982044 100644 --- a/latest/features/multi-modality.html +++ b/latest/features/multi-modality.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -551,7 +553,7 @@

    The following examples demonstrate how to use TensorRT LLM’s multimodal support in various scenarios, including quick run examples, serving endpoints, and performance benchmarking.

    Quick start#

    Quickly try out TensorRT LLM’s multimodal support using our LLM-API and a ready-to-run example:

    python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --modality image --disable_kv_cache_reuse
     
    @@ -562,7 +564,7 @@
    trtllm-serve Qwen/Qwen2-VL-7B-Instruct  --backend pytorch
     
    You can then send OpenAI-compatible requests, such as via curl or API clients, to the server endpoint. See the curl chat client for multimodal script as an example.
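
    For instance, a minimal sketch using the OpenAI Python client (assumptions for illustration only: the server is listening on the default local http://localhost:8000/v1 endpoint, the `openai` package is installed, and the image URL is a placeholder):

    from openai import OpenAI

    # Sketch only: base_url and the image URL below are assumptions, not part of the shipped example.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")
    response = client.chat.completions.create(
        model="Qwen/Qwen2-VL-7B-Instruct",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            ],
        }],
        max_tokens=64,
    )
    print(response.choices[0].message.content)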

    Run with trtllm-bench#

    @@ -720,9 +722,9 @@ diff --git a/latest/features/overlap-scheduler.html b/latest/features/overlap-scheduler.html index 9a3b76ead9..f27fef3d85 100644 --- a/latest/features/overlap-scheduler.html +++ b/latest/features/overlap-scheduler.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -702,9 +704,9 @@ diff --git a/latest/features/paged-attention-ifb-scheduler.html b/latest/features/paged-attention-ifb-scheduler.html index c57904ceee..cfb6463920 100644 --- a/latest/features/paged-attention-ifb-scheduler.html +++ b/latest/features/paged-attention-ifb-scheduler.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -607,9 +609,9 @@ different types of KV caches: contiguous and pagedThe paged KV cache decomposes the KV cache into blocks that are distributed to the different requests by a cache manager during processing. That cache manager keeps track of the sequences, allocates new blocks from a pool, and recycles those blocks when required. See the simplified implementation of -tensorrt_llm.runtime.KVCacheManager. +tensorrt_llm.runtime.KVCacheManager. A more efficient C++ implementation is included in the -Batch Manager.

    +Batch Manager.
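
    As a toy illustration only (not the tensorrt_llm.runtime.KVCacheManager API; all names here are invented), a block pool that allocates and recycles fixed-size KV blocks per sequence might look like this:

    # Illustrative sketch of a paged KV block pool.
    class ToyBlockPool:
        def __init__(self, num_blocks: int, tokens_per_block: int):
            self.tokens_per_block = tokens_per_block
            self.free_blocks = list(range(num_blocks))   # block ids not owned by any sequence
            self.seq_blocks = {}                         # sequence id -> list of block ids

        def ensure_capacity(self, seq_id: int, num_tokens: int) -> None:
            """Allocate blocks from the pool so the sequence can hold num_tokens tokens."""
            blocks = self.seq_blocks.setdefault(seq_id, [])
            needed = -(-num_tokens // self.tokens_per_block)  # ceiling division
            while len(blocks) < needed:
                if not self.free_blocks:
                    raise RuntimeError("KV cache pool exhausted")
                blocks.append(self.free_blocks.pop())

        def free_sequence(self, seq_id: int) -> None:
            """Recycle all blocks of a finished sequence back into the pool."""
            self.free_blocks.extend(self.seq_blocks.pop(seq_id, []))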

    @@ -798,9 +800,9 @@ A more efficient C++ implementation is included in the diff --git a/latest/features/parallel-strategy.html b/latest/features/parallel-strategy.html index f6b0172fb4..9c7357c127 100644 --- a/latest/features/parallel-strategy.html +++ b/latest/features/parallel-strategy.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -908,9 +910,9 @@ diff --git a/latest/features/quantization.html b/latest/features/quantization.html index a2ece9aad2..c09c2b24a4 100644 --- a/latest/features/quantization.html +++ b/latest/features/quantization.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -531,6 +533,7 @@
  • FP8 Block Scaling

  • FP8 Rowwise

  • FP8 KV Cache

  • +
  • NVFP4 KV Cache

  • W4A16 GPTQ

  • W4A8 GPTQ

  • W4A16 AWQ

  • @@ -542,7 +545,7 @@

    The default PyTorch backend supports FP4 and FP8 quantization on the latest Blackwell and Hopper GPUs.

    Running Pre-quantized Models#

    -   TensorRT LLM can directly run pre-quantized models generated with the NVIDIA TensorRT Model Optimizer.
    +   TensorRT LLM can directly run pre-quantized models generated with the NVIDIA Model Optimizer.

    from tensorrt_llm import LLM
     llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
     llm.generate("Hello, my name is")
    @@ -563,16 +566,38 @@

    NVFP4 KV Cache#

    To enable NVFP4 KV cache, offline quantization with ModelOpt is required; follow the section below for instructions. After the quantization is done, the NVFP4 KV cache option can be set by:

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig
    llm = LLM(model='/path/to/model',
              kv_cache_config=KvCacheConfig(dtype='nvfp4'))
    llm.generate("Hello, my name is")

    Offline Quantization with ModelOpt#

    If a pre-quantized model is not available on the Hugging Face Hub, you can quantize it offline using ModelOpt.

    Follow this step-by-step guide to quantize a model:

    -   git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
    -   cd TensorRT-Model-Optimizer/examples/llm_ptq
    -   scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
    +   git clone https://github.com/NVIDIA/Model-Optimizer.git
    +   cd Model-Optimizer/examples/llm_ptq
    +   scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8

    NVFP4 KV Cache#

    To generate the checkpoint for NVFP4 KV cache:

    git clone https://github.com/NVIDIA/Model-Optimizer.git
    cd Model-Optimizer/examples/llm_ptq
    scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --kv_cache_quant nvfp4

    Note that currently TRT-LLM only supports FP8 weight/activation quantization when the NVFP4 KV cache is enabled; therefore, --quant fp8 is required here.
    @@ -586,6 +611,7 @@ scripts/huggingface_example.sh --model

    [Model Supported Matrix: the table gains a new “NVFP4 KV Cache” column, inserted after the “FP8 KV Cache” column. The remaining hunks in this range only insert the new column’s “Y”/“.” cells into the existing model rows (DeepSeek-R1, EXAONE, LLaMA, Mistral, Phi, BLIP2-T5, LLaVA, and others); most visible entries in the new column are “.”, with a few rows marked “Y”.]

    @@ -853,6 +899,7 @@ The language component decides which quantization methods are supported by a given model.

    [Hardware support matrix: the same “NVFP4 KV Cache” column is added, with per-architecture “Y”/“.” cells for Blackwell (sm100) and the other listed GPU generations.]
    @@ -932,7 +984,7 @@ The language component decides which quantization methods are supported by a giv

    Quick Links#

    @@ -990,9 +1042,13 @@ The language component decides which quantization methods are supported by a giv
  • Usage
  • Model Supported Matrix
  • @@ -1089,9 +1145,9 @@ The language component decides which quantization methods are supported by a giv diff --git a/latest/features/ray-orchestrator.html b/latest/features/ray-orchestrator.html index 5f983f6cc9..0f7800366f 100644 --- a/latest/features/ray-orchestrator.html +++ b/latest/features/ray-orchestrator.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -711,9 +713,9 @@ pip install -r
    diff --git a/latest/features/sampling.html b/latest/features/sampling.html index 6d884faa06..c83c1a3c01 100644 --- a/latest/features/sampling.html +++ b/latest/features/sampling.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -527,7 +529,7 @@

    To use the feature:

    1. Enable the enable_trtllm_sampler option in the LLM class

    2. Pass a SamplingParams object with the desired options to the generate() function

    The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:

    from tensorrt_llm import LLM, SamplingParams
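    # Minimal sketch of how the example might continue (assumes the enable_trtllm_sampler
    # option named above and standard SamplingParams fields; not the verbatim documentation example).
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
    llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8', enable_trtllm_sampler=True)
    for output in llm.generate(["Hello, my name is", "Hello, my name is"], sampling_params):
        print(output.outputs[0].text)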
    @@ -583,7 +585,7 @@
     

    Logits processors allow you to modify the logits produced by the network before sampling, enabling custom generation behavior and constraints.

    To use a custom logits processor:

    1. Create a custom class that inherits from LogitsProcessor and implements the __call__ method

    2. Pass an instance of this class to the logits_processor parameter of SamplingParams

    The following example demonstrates logits processing:
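
    A minimal sketch of such a processor (assumptions: LogitsProcessor is importable from tensorrt_llm.sampling_params and the __call__ signature shown matches the LLM-API logits-processor example; see the linked example for the authoritative version):

    import torch
    from typing import List, Optional
    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.sampling_params import LogitsProcessor  # import path assumed

    class ForceTokenLogitsProcessor(LogitsProcessor):
        """Toy processor that masks every vocabulary entry except one allowed token."""

        def __init__(self, allowed_token_id: int):
            self.allowed_token_id = allowed_token_id

        def __call__(self, req_id: int, logits: torch.Tensor,
                     token_ids: List[List[int]], stream_ptr: Optional[int],
                     client_id: Optional[int]) -> None:
            # Modify the logits in place before sampling.
            mask = torch.full_like(logits, float("-inf"))
            mask[..., self.allowed_token_id] = 0.0
            logits += mask

    llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
    sampling_params = SamplingParams(max_tokens=8,
                                     logits_processor=ForceTokenLogitsProcessor(42))
    llm.generate(["Hello, my name is"], sampling_params)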

    @@ -611,7 +613,7 @@ llm.generate(["Hello, my name is"], sampling_params)
    You can find a more detailed example on logits processors here.

    @@ -758,9 +760,9 @@ diff --git a/latest/features/speculative-decoding.html b/latest/features/speculative-decoding.html index 48a6112c9e..02ae974cb5 100644 --- a/latest/features/speculative-decoding.html +++ b/latest/features/speculative-decoding.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -943,9 +945,9 @@ function. In practice, this is very cheap since the blocks are just marked as av diff --git a/latest/features/torch_compile_and_piecewise_cuda_graph.html b/latest/features/torch_compile_and_piecewise_cuda_graph.html index 67d9a26fb1..65be58087d 100644 --- a/latest/features/torch_compile_and_piecewise_cuda_graph.html +++ b/latest/features/torch_compile_and_piecewise_cuda_graph.html @@ -61,7 +61,7 @@ @@ -76,7 +76,7 @@ - + @@ -360,6 +360,7 @@
  • @@ -1084,9 +1086,9 @@ diff --git a/latest/genindex.html b/latest/genindex.html index 3166add6e3..2cf4de2972 100644 --- a/latest/genindex.html +++ b/latest/genindex.html @@ -60,7 +60,7 @@ @@ -73,7 +73,7 @@ - + @@ -353,6 +353,7 @@
  • @@ -1559,9 +1561,11 @@
  • (tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig attribute)
  • -
  • __init__() (tensorrt_llm.llmapi.AttentionDpConfig method) +
  • __init__() (tensorrt_llm.llmapi.AsyncLLM method) -