mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][test] restrict max_num_tokens in disagg mtp config (#10442)
Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com> Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
This commit is contained in:
parent
afa55c12b6
commit
d707286ca8
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 4
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
# mtp_size=2 ⇒ max_num_tokens = 4 * (2 + 1) = 12
|
||||
max_num_tokens: 12
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 6
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=2 ⇒ max_num_tokens = 8 * (2 + 1) = 24
|
||||
max_num_tokens: 24
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
# mtp_size=1 ⇒ max_num_tokens = 16 * (1 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 128
|
||||
# mtp_size=1 ⇒ max_num_tokens = 32 * (1 + 1) = 64
|
||||
max_num_tokens: 64
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 4 * (3 + 1) = 16
|
||||
max_num_tokens: 16
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
Loading…
Reference in New Issue
Block a user