chore: update doc by replacing use_cuda_graph with cuda_graph_config (#5680)

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
nv-guomingz 2025-07-04 14:39:15 +08:00 committed by GitHub
parent 32b244af38
commit c434147366
9 changed files with 20 additions and 20 deletions
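Across all nine files the pattern is the same: the boolean `use_cuda_graph` flag becomes a `cuda_graph_config` mapping, where an empty mapping appears to mean "enable CUDA graphs with default settings" (compare the `{} if use_cuda_graph else None` hunk further down), and the former `cuda_graph_*` options move under it as nested keys. A minimal sketch of the simplest case, written in the same heredoc style the touched docs use:

```bash
# Before this commit the docs set a top-level boolean flag:
#   use_cuda_graph: true
# After this commit an empty mapping enables CUDA graphs with defaults:
cat > ./extra-llm-api-config.yml <<EOF
cuda_graph_config: {}
EOF
```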


@@ -137,7 +137,7 @@ To do the benchmark, run the following command:
YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: TRTLLM
speculative_config:
decoding_type: MTP
@@ -317,7 +317,7 @@ To do the benchmark, run the following command:
YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
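Put together, the migrated `extra-llm-api-config.yml` for this MTP benchmark would presumably read as follows (keys and values copied from the hunk above; anything outside the visible context is omitted):

```bash
cat > ./extra-llm-api-config.yml <<EOF
cuda_graph_config: {}
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
EOF
```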
@@ -366,9 +366,9 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
YOUR_DATA_PATH=./dataset.txt
cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
-cuda_graph_batch_sizes:
-- 128
+cuda_graph_config:
+  batch_sizes:
+  - 128
enable_attention_dp: true
EOF
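The guide then passes this file to the benchmark run. A hedged usage sketch, assuming the surrounding document feeds the file to `trtllm-bench` through its `--extra_llm_api_options` option; the model name and the other flag values shown here are illustrative, not taken from this commit:

```bash
trtllm-bench --model deepseek-ai/DeepSeek-R1 \
    throughput \
    --dataset ${YOUR_DATA_PATH} \
    --backend pytorch \
    --extra_llm_api_options ./extra-llm-api-config.yml
```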


@@ -123,7 +123,7 @@ To benchmark min-latency performance with MTP, you need to follow [this document
YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: TRTLLM
speculative_config:
decoding_type: MTP
@@ -178,7 +178,7 @@ To benchmark min-latency performance with MTP Relaxed Acceptance, you need to fo
YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: TRTLLM
speculative_config:
decoding_type: MTP


@@ -541,7 +541,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
```bash
cat > ./extra_llm_api_options.yaml <<EOF
enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
EOF
trtllm-llmapi-launch \
@@ -622,7 +622,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
```bash
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
moe_load_balancer: ./moe_load_balancer.yaml
EOF


@@ -189,9 +189,10 @@ def gen_config_file(config_path: str,
'max_num_tokens': gen_max_num_tokens,
'max_seq_len': 8576,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
-'use_cuda_graph': True,
-'cuda_graph_padding_enabled': True,
-'cuda_graph_batch_sizes': gen_cuda_graph_batch_sizes,
+'cuda_graph_config': {
+    'padding_enabled': True,
+    'batch_sizes': gen_cuda_graph_batch_sizes,
+},
'print_iter_log': True,
'kv_cache_dtype': 'fp8',
'moe_backend': 'TRTLLM',
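In YAML form, the CUDA-graph entry that `gen_config_file` now emits would presumably look like the fragment below; where the fragment sits inside the generated file, the file name, and the batch sizes are assumptions for illustration:

```bash
# Only the CUDA-graph fragment is shown; file name and batch sizes are placeholders.
cat >> ./generated-config.yml <<EOF
cuda_graph_config:
  padding_enabled: true
  batch_sizes: [1, 2, 4, 8]
EOF
```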


@@ -58,12 +58,11 @@ since `TP` and `PP` are 1 for the two context and one generation server.
The `disagg_config.yaml` file must now contain the configuration parameters of the context and generation servers. For example,
it could look like:
-```
+```yaml
hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
backend: "pytorch"
-use_cuda_graph: False
disable_overlap_scheduler: True
context_servers:
num_instances: 2
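The hunk cuts off before the server sections end. Based on the hunk header ("two context and one generation server", `TP` and `PP` equal to 1), the rest of the file would presumably continue along these lines; every key name, port, and URL below that is not visible above is an assumption:

```bash
# Hypothetical continuation of disagg_config.yaml; key names and ports are assumptions.
cat >> ./disagg_config.yaml <<EOF
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  urls:
    - "localhost:8001"
    - "localhost:8002"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  urls:
    - "localhost:8003"
EOF
```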


@@ -26,7 +26,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
```bash
cat > ./extra_llm_api_options.yaml <<EOF
enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: WideEP
moe_max_num_tokens: 8192
EOF
@@ -116,7 +116,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
```bash
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: WideEP
moe_max_num_tokens: 9216
moe_load_balancer: ./moe_load_balancer.yaml
@@ -182,7 +182,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
```bash
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
moe_backend: WideEP
moe_max_num_tokens: 9216
moe_load_balancer: ./moe_load_balancer.yaml
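As in the other serving guides touched by this commit, the options file is then handed to the launch command. A hedged sketch, assuming the guide wraps `trtllm-serve` in `trtllm-llmapi-launch` and passes the file via `--extra_llm_api_options`; the model path and port are placeholders, not taken from this commit:

```bash
trtllm-llmapi-launch \
    trtllm-serve deepseek-ai/DeepSeek-R1 \
        --backend pytorch \
        --extra_llm_api_options ./extra_llm_api_options_eplb.yaml \
        --port 8000
```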


@@ -96,7 +96,7 @@ class TRTLLMEvalBase(TemplateLM):
tp = torch.cuda.device_count()
pytorch_config_params = {
"use_cuda_graph": use_cuda_graph,
'cuda_graph_config': {} if use_cuda_graph else None,
"print_iter_log": False,
}
if hasattr(PyTorchConfig, "moe_backend"):
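The `{} if use_cuda_graph else None` expression also suggests how the old `use_cuda_graph: False` is spelled in the new scheme: leave `cuda_graph_config` unset or null. A sketch of the two cases; the `null` spelling is inferred from the `None` fallback above rather than stated elsewhere in the commit:

```bash
# Enable CUDA graphs with default settings:
echo "cuda_graph_config: {}" > ./enable-cuda-graphs.yml
# Disable CUDA graphs (presumed equivalent of the old "use_cuda_graph: false"):
echo "cuda_graph_config: null" > ./disable-cuda-graphs.yml
```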


@@ -192,7 +192,7 @@ Evaluate the model accuracy using `trtllm-eval`.
1. (Optional) Prepare an advanced configuration file:
```bash
cat >./extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
enable_attention_dp: true
EOF
```
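A usage sketch for the step that consumes this file, assuming `trtllm-eval` accepts the configuration through `--extra_llm_api_options`; the model directory and the `mmlu` task are placeholders, not taken from this commit:

```bash
trtllm-eval --model <model_dir> \
    --extra_llm_api_options ./extra-llm-api-config.yml \
    mmlu
```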


@@ -692,7 +692,7 @@ concurrency=128
path_data=./aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt
# Setup the extra configuration for llm-api
echo -e "disable_overlap_scheduler: false\nuse_cuda_graph: true\nprint_iter_log: true\ncuda_graph_batch_sizes: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128]\nenable_attention_dp: true " > ${path_config}
echo -e "disable_overlap_scheduler: false\cuda_graph_config: {}\nprint_iter_log: true\ncuda_graph_batch_sizes: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128]\nenable_attention_dp: true " > ${path_config}
# Run trtllm-bench with pytorch backend
mpirun --allow-run-as-root --oversubscribe -n 1 \
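For the config written by the `echo -e` one-liner above, an equivalent heredoc form is easier to get right than the escaped string; the content below is copied from that line, only the formatting differs:

```bash
cat > ${path_config} <<EOF
disable_overlap_scheduler: false
cuda_graph_config: {}
print_iter_log: true
cuda_graph_batch_sizes: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128]
enable_attention_dp: true
EOF
```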