[None][test] modify ctx config in 128k8k disagg cases (#10779)

Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com>
Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Author: ruodil
Date: 2026-01-19 14:10:19 +08:00
Committer: Yanchao Lu
Parent: af49fbdf65
Commit: 4df0ca8bd1
35 changed files with 60 additions and 59 deletions
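
Every YAML hunk in this commit applies the same two-part change to the context (ctx) worker of a 128k-input/8k-output disaggregated-serving perf case: max_batch_size is reduced to 1 and enable_attention_dp is switched from true to false. A minimal sketch of the resulting ctx block, assembled from the fields visible in the hunks below; the file names and the sibling gen worker block are not shown in this view, and the YAML indentation is reconstructed, not verbatim:

# Representative ctx worker block after this commit. Values come from the
# hunks below; layout and indentation are assumed.
worker_config:
  ctx:
    max_batch_size: 1            # was 2/4/8/16/32, depending on the case
    max_num_tokens: 131104       # 131072 (128k) prompt tokens + 32 headroom
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: false   # was true in every case
    pipeline_parallel_size: 4    # 8 in some of the cases
    print_iter_log: true
    cuda_graph_config: null

With max_num_tokens sized for a single 131072-token prompt, a ctx batch size above 1 adds little scheduling room for this workload, which is presumably why the cap is lowered across all cases.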

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -87,7 +87,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -86,7 +86,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null
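
Many of these cases also run MTP speculative decoding on the generation side; the decoding_type and num_nextn_predict_layers context lines vary between 1 and 3 draft layers across the files. A hedged sketch of how such a block is typically spelled, assuming these keys live in a speculative_config section of the gen worker (their enclosing block is not visible in this diff view):

# Assumed nesting: the hunk context shows these keys immediately above ctx:,
# which matches a gen-side speculative_config block in comparable configs.
worker_config:
  gen:
    speculative_config:
      decoding_type: MTP            # multi-token-prediction draft decoding
      num_nextn_predict_layers: 2   # 1, 2, or 3 across the cases in this commit
  ctx:
    max_batch_size: 1               # the cap introduced by this commit
    enable_attention_dp: false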

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -87,7 +87,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -82,12 +82,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -83,7 +83,7 @@ worker_config:
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -78,12 +78,12 @@ worker_config:
     num_postprocess_workers: 4
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     num_nextn_predict_layers: 3
     allreduce_strategy: MNNVL
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -80,12 +80,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 8
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 2
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 16
+    max_batch_size: 1
    max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 32
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
   ctx:
-    max_batch_size: 32
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 4
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -77,12 +77,12 @@ worker_config:
     stream_interval: 20
     num_postprocess_workers: 4
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -81,12 +81,12 @@ worker_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
   ctx:
-    max_batch_size: 8
+    max_batch_size: 1
     max_num_tokens: 131104
     max_seq_len: 131104
     tensor_parallel_size: 1
     moe_expert_parallel_size: 1
-    enable_attention_dp: true
+    enable_attention_dp: false
     pipeline_parallel_size: 4
     print_iter_log: true
     cuda_graph_config: null

View File

@@ -238,6 +238,7 @@ TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=
     "llama_v3.3_nemotron_super_49b_fp8",
     "llama_v3.1_nemotron_ultra_253b",
     "llama_v3.1_nemotron_ultra_253b_fp8",
+    "kimi_k2_nvfp4",
 }
 # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)