commit c434147366
parent 32b244af38

chore: update doc by replacing use_cuda_graph with cuda_graph_config (#5680)

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
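The pattern applied throughout the hunks below is the same: the boolean `use_cuda_graph` flag and the flat `cuda_graph_*` keys are folded into a single nested `cuda_graph_config` entry, where an empty mapping means "enabled with default settings". A minimal Python sketch of that mapping, using only key names that appear in this diff (the helper itself is illustrative, not part of the repository):

```python
# Illustrative only: shows how the flat CUDA-graph options touched in this
# commit map onto the nested cuda_graph_config entry, based on the hunks below.
from typing import Any, Dict


def migrate_cuda_graph_options(old: Dict[str, Any]) -> Dict[str, Any]:
    """Fold use_cuda_graph / cuda_graph_* keys into cuda_graph_config."""
    new = {k: v for k, v in old.items()
           if k not in ("use_cuda_graph",
                        "cuda_graph_padding_enabled",
                        "cuda_graph_batch_sizes")}
    if old.get("use_cuda_graph"):
        cfg: Dict[str, Any] = {}
        if "cuda_graph_padding_enabled" in old:
            cfg["padding_enabled"] = old["cuda_graph_padding_enabled"]
        if "cuda_graph_batch_sizes" in old:
            cfg["batch_sizes"] = old["cuda_graph_batch_sizes"]
        new["cuda_graph_config"] = cfg  # {} means "enabled with defaults"
    return new


print(migrate_cuda_graph_options({"use_cuda_graph": True,
                                  "cuda_graph_batch_sizes": [128],
                                  "enable_attention_dp": True}))
# -> {'enable_attention_dp': True, 'cuda_graph_config': {'batch_sizes': [128]}}
```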
@@ -137,7 +137,7 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: TRTLLM
 speculative_config:
   decoding_type: MTP
@@ -317,7 +317,7 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@@ -366,9 +366,9 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
 YOUR_DATA_PATH=./dataset.txt

 cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
-cuda_graph_batch_sizes:
-- 128
+cuda_graph_config:
+  batch_sizes:
+  - 128
 enable_attention_dp: true
 EOF

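The nested form written by this heredoc places `batch_sizes` under `cuda_graph_config`; if the indentation is in doubt, the fragment can be checked quickly with a YAML parser (a sketch assuming PyYAML is available; the fragment is copied from the `+` lines above):

```python
# Sanity-check sketch: parse the new nested YAML fragment shown above.
import yaml

fragment = """
cuda_graph_config:
  batch_sizes:
  - 128
enable_attention_dp: true
"""

print(yaml.safe_load(fragment))
# -> {'cuda_graph_config': {'batch_sizes': [128]}, 'enable_attention_dp': True}
```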
@@ -123,7 +123,7 @@ To benchmark min-latency performance with MTP, you need to follow [this document
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: TRTLLM
 speculative_config:
   decoding_type: MTP
@@ -178,7 +178,7 @@ To benchmark min-latency performance with MTP Relaxed Acceptance, you need to fo
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: TRTLLM
 speculative_config:
   decoding_type: MTP
@@ -541,7 +541,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 ```bash
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
 EOF

 trtllm-llmapi-launch \
@@ -622,7 +622,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 ```bash
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF

@@ -189,9 +189,10 @@ def gen_config_file(config_path: str,
 'max_num_tokens': gen_max_num_tokens,
 'max_seq_len': 8576,
 'free_gpu_memory_fraction': gen_gpu_memory_fraction,
-'use_cuda_graph': True,
-'cuda_graph_padding_enabled': True,
-'cuda_graph_batch_sizes': gen_cuda_graph_batch_sizes,
+'cuda_graph_config': {
+    'padding_enabled': True,
+    'batch_sizes': gen_cuda_graph_batch_sizes,
+},
 'print_iter_log': True,
 'kv_cache_dtype': 'fp8',
 'moe_backend': 'TRTLLM',
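The nested dict built here mirrors the YAML files used in the benchmark instructions above. As a rough sketch of how such a dict ends up on disk (assuming PyYAML, with hypothetical stand-in values for the `gen_*` variables):

```python
# Sketch only: serializes a config dict like the one above into the kind of
# extra-llm-api-config.yml file used elsewhere in this change.
# The batch sizes below are hypothetical stand-ins, not values from the repo.
import yaml

gen_cuda_graph_batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]

config = {
    'max_seq_len': 8576,
    'cuda_graph_config': {
        'padding_enabled': True,
        'batch_sizes': gen_cuda_graph_batch_sizes,
    },
    'print_iter_log': True,
    'kv_cache_dtype': 'fp8',
    'moe_backend': 'TRTLLM',
}

with open('./extra-llm-api-config.yml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False)
```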
@@ -58,12 +58,11 @@ since `TP` and `PP` is 1 for the two context and one generation server.
 The `disagg_config.yaml` file must now contain the configuration parameters of the context and generation servers. For example,
 it could look like:

-```
+```yaml
 hostname: localhost
 port: 8000
 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
-use_cuda_graph: False
 disable_overlap_scheduler: True
 context_servers:
   num_instances: 2
@@ -26,7 +26,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 ```bash
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: WideEP
 moe_max_num_tokens: 8192
 EOF
@@ -116,7 +116,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 ```bash
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
@@ -182,7 +182,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 ```bash
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
-use_cuda_graph: true
+cuda_graph_config: {}
 moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
@@ -96,7 +96,7 @@ class TRTLLMEvalBase(TemplateLM):
 tp = torch.cuda.device_count()

 pytorch_config_params = {
-    "use_cuda_graph": use_cuda_graph,
+    'cuda_graph_config': {} if use_cuda_graph else None,
     "print_iter_log": False,
 }
 if hasattr(PyTorchConfig, "moe_backend"):
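The conditional above encodes the new convention: an empty `cuda_graph_config` dict enables CUDA graphs with default settings, while `None` (or omitting the key, as in the `disagg_config.yaml` hunk above) leaves them disabled. A small, library-agnostic sketch of that toggle; `build_llm_kwargs` is a hypothetical helper, not a TensorRT-LLM API:

```python
# Hypothetical helper mirroring the pattern in the TRTLLMEvalBase change above:
# an empty dict enables CUDA graphs with default settings, None disables them.
from typing import Any, Dict, List, Optional


def build_llm_kwargs(use_cuda_graph: bool,
                     batch_sizes: Optional[List[int]] = None) -> Dict[str, Any]:
    """Translate the old boolean flag into the new nested config."""
    cuda_graph_config: Optional[Dict[str, Any]] = None
    if use_cuda_graph:
        # {} means "enabled with defaults"; add batch_sizes only if given.
        cuda_graph_config = {}
        if batch_sizes is not None:
            cuda_graph_config["batch_sizes"] = batch_sizes
    return {
        "cuda_graph_config": cuda_graph_config,
        "print_iter_log": False,
    }


print(build_llm_kwargs(True, [1, 2, 4, 8]))
print(build_llm_kwargs(False))
```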
@@ -192,7 +192,7 @@ Evaluate the model accuracy using `trtllm-eval`.
 1. (Optional) Prepare an advanced configuration file:
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
-use_cuda_graph: true
+cuda_graph_config: {}
 enable_attention_dp: true
 EOF
 ```
@@ -692,7 +692,7 @@ concurrency=128
 path_data=./aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt

 # Setup the extra configuration for llm-api
-echo -e "disable_overlap_scheduler: false\nuse_cuda_graph: true\nprint_iter_log: true\ncuda_graph_batch_sizes: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128]\nenable_attention_dp: true " > ${path_config}
+echo -e "disable_overlap_scheduler: false\ncuda_graph_config: {}\nprint_iter_log: true\ncuda_graph_batch_sizes: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128]\nenable_attention_dp: true " > ${path_config}

 # Run trtllm-bench with pytorch backend
 mpirun --allow-run-as-root --oversubscribe -n 1 \