Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-05 02:31:33 +08:00
[None][test] modify ctx config in 128k8k disagg cases (#10779)
Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com>
Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
parent af49fbdf65
commit 4df0ca8bd1
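All of the YAML hunks below apply the same adjustment to the context (ctx) worker in the 128k8k disaggregated-serving test configs: the ctx max_batch_size drops to 1 and enable_attention_dp flips to false (some hunks carry only the enable_attention_dp flip); the surrounding keys are untouched. A minimal sketch of the resulting ctx block, assembled from the hunks below (the indentation and the enclosing worker_config layout are assumptions, not copied from any single file):

worker_config:
  ctx:
    max_batch_size: 1            # previously 2/4/8/16/32 depending on the case
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: false   # previously true
    pipeline_parallel_size: 4    # 8 in some cases
    print_iter_log: true
    cuda_graph_config: null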
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -87,7 +87,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
    pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -86,7 +86,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -87,7 +87,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -82,12 +82,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     num_nextn_predict_layers: 3
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -80,12 +80,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
    max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 32
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 32
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null
@@ -238,6 +238,7 @@ TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=
     "llama_v3.3_nemotron_super_49b_fp8",
     "llama_v3.1_nemotron_ultra_253b",
     "llama_v3.1_nemotron_ultra_253b_fp8",
+    "kimi_k2_nvfp4",
 }

 # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)