dominicshanshan 2026-01-13 21:25:08 +08:00 committed by GitHub
commit a580db94d9
8 changed files with 22 additions and 17 deletions


@@ -155,9 +155,9 @@ These optimizations target the overall execution flow, scheduling, and resource
CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and especially on the PyTorch flow, where the Python host code normally executes more slowly than C++. Since a CUDA Graph freezes the kernel launch parameters, which are normally tied to the tensor shapes, it can only be safely used with static shapes, meaning that different CUDA Graphs need to be captured for different batch sizes. Each graph incurs some cost in memory usage and capture time, so we cannot capture a CUDA Graph for every possible batch size. For the non-captured batch sizes, PyTorch eager mode code is executed.
-There is a feature called CUDA Graph padding in TensorRT LLM, which provides a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it pads a batch up to the nearest size that has a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted token computation.
+There is a feature called CUDA Graph padding in TensorRT LLM, which provides a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it pads a batch up to the nearest size that has a captured CUDA Graph. The CUDA Graph padding feature is enabled by default to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted token computation.
-Users can opt out of the CUDA Graph padding feature to see the perf benefits, by setting `cuda_graph_config:\n enable_padding: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
+Users can opt out of the CUDA Graph padding feature to see whether disabling it provides perf benefits for their specific workload, by setting `cuda_graph_config:\n enable_padding: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
* Overlap Scheduler:
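For reference, a minimal sketch of opting back out of padding through the PyTorch-backend LLM API, assuming `CudaGraphConfig` is exported from `tensorrt_llm.llmapi` and that the `LLM` constructor accepts a `cuda_graph_config` argument (as the test change at the end of this commit suggests); the model path is hypothetical:

```python
# Minimal sketch: disable CUDA Graph padding now that it defaults to True.
# Assumes CudaGraphConfig is exported from tensorrt_llm.llmapi and that LLM
# accepts a cuda_graph_config argument; the model path is hypothetical.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig

llm = LLM(
    model="/path/to/model",  # hypothetical checkpoint path or HF model id
    cuda_graph_config=CudaGraphConfig(
        enable_padding=False,  # opt out of padding to measure its overhead
        max_batch_size=128,    # capture graphs for batch sizes up to 128
    ),
)
```

The same setting can also be expressed as the `cuda_graph_config:\n enable_padding: False` YAML snippet referenced in the doc line above.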


@@ -125,9 +125,9 @@ def parse_arguments():
default=False,
action='store_true',
help='Print iteration logs during execution')
-parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+parser.add_argument('--use_cuda_graph', default=True, action='store_true')
parser.add_argument('--cuda_graph_padding_enabled',
-default=False,
+default=True,
action='store_true')
parser.add_argument('--cuda_graph_batch_sizes',
nargs='+',


@@ -108,9 +108,9 @@ def add_llm_args(parser):
parser.add_argument('--enable_chunked_prefill',
default=False,
action='store_true')
-parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+parser.add_argument('--use_cuda_graph', default=True, action='store_true')
parser.add_argument('--cuda_graph_padding_enabled',
-default=False,
+default=True,
action='store_true')
parser.add_argument('--cuda_graph_batch_sizes',
nargs='+',


@@ -176,9 +176,9 @@ def parse_arguments() -> argparse.Namespace:
default=False,
action='store_true',
help='Print iteration logs during execution')
-parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+parser.add_argument('--use_cuda_graph', default=True, action='store_true')
parser.add_argument('--cuda_graph_padding_enabled',
-default=False,
+default=True,
action='store_true')
parser.add_argument('--cuda_graph_batch_sizes',
nargs='+',


@@ -117,7 +117,7 @@ class CudaGraphConfig(StrictBaseModel):
default=0, description="Maximum batch size for CUDA graphs.")
enable_padding: bool = Field(
-default=False,
+default=True,
description=
"If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
)
@@ -148,10 +148,16 @@ class CudaGraphConfig(StrictBaseModel):
else:
batch_sizes = list(range(1, 32)) + [32, 64, 128]
-# Add powers of 2 up to max_batch_size
-batch_sizes += [
-    2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
-]
+if enable_padding:
+    # Use tile size of 64 when padding is enabled (finer granularity)
+    batch_sizes += [
+        i * 64 for i in range(3, (max_batch_size // 64) + 1)
+    ]
+else:
+    # Add powers of 2 up to max_batch_size
+    batch_sizes += [
+        2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
+    ]
# Filter and sort batch sizes
batch_sizes = sorted(
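To make the effect of this change concrete, below is a self-contained sketch that mirrors the new default batch-size selection (an illustrative reimplementation, not the library code; the trailing filter step, cut off at `batch_sizes = sorted(` above, is assumed to dedupe and drop sizes above `max_batch_size`):

```python
import math

def default_cuda_graph_batch_sizes(max_batch_size: int, enable_padding: bool):
    # Mirrors the hunk above: small sizes 1..32, then 64 and 128.
    batch_sizes = list(range(1, 32)) + [32, 64, 128]
    if enable_padding:
        # Tiles of 64 above 128: padding rounds a batch up to the next
        # captured size, so finer granularity keeps the padding waste small.
        batch_sizes += [i * 64 for i in range(3, (max_batch_size // 64) + 1)]
    else:
        # Powers of 2 above 128, as before.
        batch_sizes += [
            2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
        ]
    # Assumed filter/sort step: dedupe and cap at max_batch_size.
    return sorted(b for b in set(batch_sizes) if b <= max_batch_size)

# enable_padding=True  -> ..., 64, 128, 192, 256, 320, 384
print(default_cuda_graph_batch_sizes(384, enable_padding=True))
# enable_padding=False -> ..., 64, 128, 256 (powers of two only above 128)
print(default_cuda_graph_batch_sizes(384, enable_padding=False))
```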


@@ -19,7 +19,7 @@ generation_servers:
pipeline_parallel_size: 1
enable_attention_dp: true
cuda_graph_config:
-enable_padding: False
+enable_padding: True
disable_overlap_scheduler: False
cache_transceiver_config:
backend: DEFAULT


@@ -17,7 +17,7 @@ generation_servers:
tensor_parallel_size: 2
pipeline_parallel_size: 1
cuda_graph_config:
-enable_padding: False
+enable_padding: True
disable_overlap_scheduler: False
cache_transceiver_config:
backend: DEFAULT


@@ -141,8 +141,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
max_tokens=8192)
cuda_graph_config = CudaGraphConfig(
-batch_sizes=[i for i in range(1, max_batch_size +
-             1)]) if use_cuda_graph else None
+max_batch_size=max_batch_size) if use_cuda_graph else None
llm_common_config = dict(
model=target_model_dir,
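The test above now derives its CUDA Graph capture list from `max_batch_size` rather than enumerating every batch size explicitly. A small sketch of the two constructions (import path assumed, as in the sketch further up):

```python
from tensorrt_llm.llmapi import CudaGraphConfig  # assumed import path

max_batch_size = 8

# Before: one CUDA Graph per batch size from 1 to max_batch_size.
explicit = CudaGraphConfig(batch_sizes=list(range(1, max_batch_size + 1)))

# After: derive the capture (and padding) sizes from max_batch_size via the
# default selection shown in the CudaGraphConfig hunk above.
derived = CudaGraphConfig(max_batch_size=max_batch_size)
```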