[TRTLLM-9965][test] add long-context disagg test for GB300/GB200 and remove config_index in yaml (#10225)

Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com> Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-12-30 15:39:50 +08:00 · 2025-12-30 15:39:50 +08:00 · 0f4ed90560
commit 0f4ed90560
parent 692d8f2023
83 changed files with 3610 additions and 46 deletions
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 17
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 19
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 23
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 16
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 20
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 18
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 22
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@ -8,7 +8,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@ -8,7 +8,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 21
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen13_tep4 / bs1 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 13 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 13
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=4, EP=4, PP=1, attention DP disabled ("tep4" in the
  # file name).
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen5_tep4 / bs4 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string; it equals the
  # max_batch_size (4) of both workers below.
  concurrency_list: '4'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 5 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 5
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=4, EP=4, PP=1, attention DP disabled ("tep4" in the
  # file name).
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 4
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 4 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 4
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml
@ -0,0 +1,105 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen6_tep8 / bs1 / eplb0 / mtp3.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 6 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 6
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=8, EP=8, PP=1, attention DP disabled ("tep8" in the
  # file name), with MTP speculative decoding (3 next-n predict layers).
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    # Was 4 (the ctx worker's pipeline depth). The tep8 gen layout, as in
    # the sibling gen7_tep8 / gen8_tep8 configs, uses no pipeline
    # parallelism on the gen side.
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled. It carries the same
  # MTP speculative_config values as the gen worker.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen7_tep8 / bs1 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 7 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 7
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=8, EP=8, PP=1, attention DP disabled ("tep8" in the
  # file name).
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen8_tep4 / bs2 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string; it equals the
  # max_batch_size (2) of both workers below.
  concurrency_list: '2'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 8 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 8
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=4, EP=4, PP=1, attention DP disabled ("tep4" in the
  # file name).
  gen:
    # These three were 1 / 1 / 4, i.e. the ctx worker's values. Restored to
    # the 4 / 4 / 1 layout that every other *_tep4_* config in this
    # directory uses for its gen worker.
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 2 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 2
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB300.
 # Layout per the file name: ctx1_pp4 / gen8_tep8 / bs1 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 8 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 8
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=8, EP=8, PP=1, attention DP disabled ("tep8" in the
  # file name).
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=4 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB200.
 # Layout per the file name: ctx1_pp8 / gen11_tep4 / bs2 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string; it equals the
  # max_batch_size (2) of both workers below.
  concurrency_list: '2'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 11 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 11
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=4, EP=4, PP=1, attention DP disabled ("tep4" in the
  # file name).
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 2 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=8 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 2
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB200.
 # Layout per the file name: ctx1_pp8 / gen14_tep4 / bs1 / eplb0 / mtp0.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 14 gen servers, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 14
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen workers: TP=4, EP=4, PP=1, attention DP disabled ("tep4" in the
  # file name).
  gen:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  # ctx worker: TP=1, EP=1, PP=8 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml
@ -0,0 +1,102 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB200.
 # Layout per the file name: ctx1_pp8 / gen1_dep16 / bs1 / eplb0 / mtp3.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string.
  concurrency_list: '1'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 1 gen server, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen worker: TP=16, EP=16, PP=1 with attention DP enabled ("dep16" in the
  # file name). Unlike the tep configs, no allreduce_strategy is set here.
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 1
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 1 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    # MTP speculative decoding with 3 next-n predict layers. The &id001
    # anchor lets the ctx worker reuse this exact mapping via *id001.
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 3
  # ctx worker: TP=1, EP=1, PP=8 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 1
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    # Alias of the gen worker's speculative_config mapping.
    speculative_config: *id001
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml
@ -0,0 +1,102 @@
 # Disagg perf benchmark config: deepseek-r1-fp4, 128k8k, GB200.
 # Layout per the file name: ctx1_pp8 / gen1_dep8 / bs4 / eplb0 / mtp2.
 # Angle-bracket values (<partition>, <account>, ...) are placeholders —
 # presumably substituted by the test harness; confirm.
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  # Quoted so the single concurrency level stays a string; it equals the
  # max_batch_size (4) of both workers below.
  concurrency_list: '4'
  # 131072 (128 * 1024) input tokens and 8192 (8 * 1024) output tokens.
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 # 1 ctx server and 1 gen server, 4 GPUs per node.
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  # gen worker: TP=8, EP=8, PP=1 with attention DP enabled ("dep8" in the
  # file name). Unlike the tep configs, no allreduce_strategy is set here.
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
    max_num_tokens: 128
    # 139296 = 131072 + 8192 + 32 (input + output + 32 extra tokens).
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): the list goes up to 512 while max_batch_size is 4 —
      # confirm the larger sizes are intended.
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    # MTP speculative decoding with 2 next-n predict layers. The &id001
    # anchor lets the ctx worker reuse this exact mapping via *id001.
    speculative_config: &id001
      decoding_type: MTP
      num_nextn_predict_layers: 2
  # ctx worker: TP=1, EP=1, PP=8 with attention DP enabled; CUDA graphs are
  # set to null and the overlap scheduler is disabled.
  ctx:
    max_batch_size: 4
    # 131104 = 131072 + 32; the same value is used for max_seq_len and
    # max_tokens_in_buffer.
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    # Alias of the gen worker's speculative_config mapping.
    speculative_config: *id001
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 metadata:  # 1 ctx server (PP8), 1 gen server (TP8/EP8), max_batch_size 1, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '1'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1  # "bs1" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 1 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  ctx:
    max_batch_size: 1
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml
@ -0,0 +1,103 @@
 metadata:  # 1 ctx server (PP8), 1 gen server (TP8/EP8), max_batch_size 1, MTP with 3 next-n layers
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '1'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1  # "bs1" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 1 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    speculative_config: &mtp_speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3  # "mtp3" in the file name
  ctx:
    max_batch_size: 1
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *mtp_speculative_config  # alias: ctx reuses the gen MTP settings
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml
@ -0,0 +1,103 @@
 metadata:  # 1 ctx server (PP8), 1 gen server (TP8/EP8), max_batch_size 2, MTP with 3 next-n layers
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2  # "bs2" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 2 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    speculative_config: &mtp_speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3  # "mtp3" in the file name
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *mtp_speculative_config  # alias: ctx reuses the gen MTP settings
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml
@ -0,0 +1,103 @@
 metadata:  # 1 ctx server (PP8), 5 gen servers (TP8/EP8), max_batch_size 2, MTP with 3 next-n layers
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 5  # "gen5" in the file name
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2  # "bs2" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 2 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    speculative_config: &mtp_speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3  # "mtp3" in the file name
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *mtp_speculative_config  # alias: ctx reuses the gen MTP settings
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml
@ -0,0 +1,103 @@
 metadata:  # 1 ctx server (PP8), 7 gen servers (TP4/EP4), max_batch_size 2, MTP with 2 next-n layers
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 7  # "gen7" in the file name
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 4  # "tep4" in the file name
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2  # "bs2" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 2 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
    speculative_config: &mtp_speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 2  # "mtp2" in the file name
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *mtp_speculative_config  # alias: ctx reuses the gen MTP settings
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 metadata:  # 1 ctx server (PP8), 7 gen servers (TP8/EP8), max_batch_size 1, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '1'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 7  # "gen7" in the file name
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1  # "bs1" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 1 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  ctx:
    max_batch_size: 1
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default.yaml
@ -0,0 +1,99 @@
 metadata:  # 1 ctx server (PP8), 8 gen servers (TP4/EP4), max_batch_size 4, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '4'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 1
  num_gen_servers: 8  # "gen8" in the file name
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 4  # "tep4" in the file name
    moe_expert_parallel_size: 4
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 4  # "bs4" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 4 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    allreduce_strategy: MNNVL
  ctx:
    max_batch_size: 4
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml
@ -0,0 +1,105 @@
 metadata:  # 2 ctx servers (PP4), 7 gen servers (TP8/EP8), max_batch_size 2, MTP with 3 next-n layers
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # same value as the gen/ctx max_batch_size below
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 2  # "ctx2" in the file name
  num_gen_servers: 7  # "gen7" in the file name
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "tep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2  # "bs2" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 2 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3  # "mtp3" in the file name
    allreduce_strategy: MNNVL
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4  # "pp4" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # same values as the gen speculative_config in this file
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 2 ctx servers (PP8), 1 gen server (TP16/EP16, attention DP), max_batch_size 8, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '8'  # equals max_batch_size; NOTE(review): gen uses attention DP with TP16 - confirm 8 is intended
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 2  # "ctx2" in the file name
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16  # "dep16" in the file name
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8  # "bs8" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 8 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 8
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 2 ctx servers (PP8), 1 gen server (TP32/EP32, attention DP), max_batch_size 2, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # equals max_batch_size; NOTE(review): gen uses attention DP with TP32 - confirm 2 is intended
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 2  # "ctx2" in the file name
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32  # "dep32" in the file name
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 2  # "bs2" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 2 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 3 ctx servers (PP4), 1 gen server (TP8/EP8, attention DP), max_batch_size 16, MTP with 1 next-n layer
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '128'  # 8 (gen tensor_parallel_size) * 16 (gen max_batch_size)
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 3  # "ctx3" in the file name
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 8  # "dep8" in the file name
    moe_expert_parallel_size: 8
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16  # "bs16" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 16 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1  # "mtp1" in the file name
  ctx:
    max_batch_size: 16
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4  # "pp4" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # same values as the gen speculative_config in this file
      decoding_type: MTP
      num_nextn_predict_layers: 1
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 3 ctx servers (PP8), 1 gen server (TP16/EP16, attention DP), max_batch_size 16, no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k  # input_length 131072 = 128 * 1024, output_length 8192 = 8 * 1024
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: '02:00:00'  # quoted so this digits-and-colons value always loads as a string
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # same GPU count as hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '16'  # equals max_batch_size; NOTE(review): gen uses attention DP with TP16 - confirm 16 is intended
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 3  # "ctx3" in the file name
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16  # "dep16" in the file name
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16  # "bs16" in the file name
    max_num_tokens: 128
    max_seq_len: 139296  # 131072 (input_length) + 8192 (output_length) + 32
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): list goes up to 512 while max_batch_size is 16 - confirm larger sizes are skipped
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 16
    max_num_tokens: 131104  # 131072 (input_length) + 32
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8  # "pp8" in the file name
    print_iter_log: true
    cuda_graph_config: null  # no CUDA graph settings for the ctx worker
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml
@ -0,0 +1,102 @@
 metadata:  # 128k8k disagg perf config for GB200: 3 ctx servers (PP 8) + 1 gen server (TP/EP 16), MTP 2
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '8'  # NOTE(review): equals gen max_batch_size; confirm not meant to be 16 * 8 = 128
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 3
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (8) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config: &id001  # anchor; reused by ctx.speculative_config below
      decoding_type: MTP
      num_nextn_predict_layers: 2  # matches "mtp2" in the file name
  ctx:
    max_batch_size: 8
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *id001  # alias: same MTP settings as gen
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
@ -0,0 +1,102 @@
 metadata:  # 128k8k disagg perf config for GB200: 3 ctx servers (PP 8) + 1 gen server (TP/EP 32), MTP 3
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '2'  # NOTE(review): equals gen max_batch_size; confirm not meant to be 32 * 2 = 64
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 3
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 2
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (2) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config: &id001  # anchor; reused by ctx.speculative_config below
      decoding_type: MTP
      num_nextn_predict_layers: 3  # matches "mtp3" in the file name
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config: *id001  # alias: same MTP settings as gen
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 128k8k disagg perf config for GB200: 3 ctx servers (PP 8) + 1 gen server (TP/EP 32), no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB200
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '4'  # NOTE(review): equals gen max_batch_size; confirm not meant to be 32 * 4 = 128
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 3
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (4) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 4
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 8
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 128k8k disagg perf config for GB300: 5 ctx servers (PP 4) + 1 gen server (TP/EP 16), no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '256'  # = 16 * 16 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 5
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (16) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 16
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 128k8k disagg perf config for GB300: 5 ctx servers (PP 4) + 1 gen server (TP/EP 16), MTP 3
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '128'  # = 16 * 8 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 5
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (8) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3  # matches "mtp3" in the file name
  ctx:
    max_batch_size: 8
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # identical to gen.speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 128k8k disagg perf config for GB300: 5 ctx servers (PP 4) + 1 gen server (TP/EP 32), MTP 3
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '64'  # = 32 * 2 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 5
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 2
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (2) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3  # matches "mtp3" in the file name
  ctx:
    max_batch_size: 2
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # identical to gen.speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 128k8k disagg perf config for GB300: 5 ctx servers (PP 4) + 1 gen server (TP/EP 32), no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '128'  # = 32 * 4 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 5
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (4) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 4
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 128k8k disagg perf config for GB300: 7 ctx servers (PP 4) + 1 gen server (TP/EP 16), MTP 1
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '256'  # = 16 * 16 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 7
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (16) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1  # matches "mtp1" in the file name
  ctx:
    max_batch_size: 16
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # identical to gen.speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 1
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 128k8k disagg perf config for GB300: 7 ctx servers (PP 4) + 1 gen server (TP/EP 16), no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '512'  # = 16 * 32 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 7
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 32
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (32) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 32
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 128k8k disagg perf config for GB300: 8 ctx servers (PP 4) + 1 gen server (TP/EP 16), MTP 1
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '512'  # = 16 * 32 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 8
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 32
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (32) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1  # matches "mtp1" in the file name
  ctx:
    max_batch_size: 32
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # identical to gen.speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 1
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml
@ -0,0 +1,104 @@
 metadata:  # 128k8k disagg perf config for GB300: 8 ctx servers (PP 4) + 1 gen server (TP/EP 32), MTP 3
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '128'  # = 32 * 4 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 8
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (4) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3  # matches "mtp3" in the file name
  ctx:
    max_batch_size: 4
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:  # identical to gen.speculative_config
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default.yaml
@ -0,0 +1,98 @@
 metadata:  # 128k8k disagg perf config for GB300: 8 ctx servers (PP 4) + 1 gen server (TP/EP 32), no speculative_config
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm  # duplicates metadata.script_file
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4  # GPU count matches hardware.gpus_per_node
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '256'  # = 32 * 8 (gen tensor_parallel_size * gen max_batch_size)
  input_length: 131072  # 128 * 1024
  output_length: 8192  # 8 * 1024
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 8
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 128
    max_seq_len: 139296  # = 131072 + 8192 + 32 (input_length + output_length + 32)
    cuda_graph_config:
      enable_padding: true
      batch_sizes:  # NOTE(review): entries above max_batch_size (8) are listed too; confirm intended
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104  # same value as ctx max_seq_len
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
  ctx:
    max_batch_size: 8
    max_num_tokens: 131104  # = 131072 + 32 (input_length + 32)
    max_seq_len: 131104  # same value as max_num_tokens
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null  # explicitly null for ctx; only gen lists batch_sizes
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml
@ -0,0 +1,104 @@
 metadata:
  model_name: deepseek-r1-fp4
  precision: fp4
  model_dir_name: DeepSeek-R1-0528-FP4-v2
  supported_gpus:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 128k8k
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: --gres=gpu:4
  numa_bind: true
 benchmark:
  mode: e2e
  use_nv_sa_benchmark: true
  multi_round: 1
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '256'
  input_length: 131072
  output_length: 8192
  dataset_file: <dataset_file>
 hardware:
  gpus_per_node: 4
  num_ctx_servers: 8
  num_gen_servers: 1
 environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
    TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
  server_env_var: TRTLLM_SERVER_DISABLE_GC=1
 profiling:
  nsys_on: false
 accuracy:
  enable_accuracy_test: false
 worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 128
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
  ctx:
    max_batch_size: 8
    max_num_tokens: 131104
    max_seq_len: 131104
    tensor_parallel_size: 1
    moe_expert_parallel_size: 1
    enable_attention_dp: true
    pipeline_parallel_size: 4
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 131104
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
    moe_config:
      backend: TRTLLM
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 3
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 7
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 1
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 5
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 0
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 4
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 2
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 6
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 8
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 12
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 9
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 13
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 11
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 15
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 10
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 14
 slurm:
  script_file: disaggr_torch.slurm
  partition: <partition>
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 0
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
  accuracy:
    datasets:
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 6
  dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
  accuracy:
    datasets:
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 8
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 11
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 10
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 13
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 9
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 12
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 1
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 3
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 0
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 2
  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@ -8,7 +8,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 7
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 14
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 5
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 7
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 4
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 6
  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 1k1k
  config_index: 6
  dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml
@ -7,7 +7,6 @@ metadata:
  - GB300
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  config_index: 6
  dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json
 slurm:
  script_file: disaggr_torch.slurm
--- a/tests/integration/defs/perf/disagg/testlist/all.txt
+++ b/tests/integration/defs/perf/disagg/testlist/all.txt
@ -26,6 +26,45 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
 # 128k8k GB300 (pp4) cases
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
 # 128k8k GB200 (pp8) cases
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
 # WIDEEP cases
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
--- a/tests/integration/defs/perf/disagg/testlist/disagg.txt
+++ b/tests/integration/defs/perf/disagg/testlist/disagg.txt
@ -24,3 +24,42 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX]
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
 # 128k8k GB300 (pp4) cases
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
 # 128k8k GB200 (pp8) cases
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]