[TRTLLM-9965][test] add long-context disagg test for GB300/GB200 and remove config_index in yaml (#10225)

Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com>
Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
ruodil committed via GitHub on 2025-12-30 15:39:50 +08:00
parent 692d8f2023
commit 0f4ed90560
83 changed files with 3610 additions and 46 deletions

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 17
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 21
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 19
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 23
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 16
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 20
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 18
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 22
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -8,7 +8,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 21
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

@@ -8,7 +8,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
-config_index: 21
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
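
All of the hunks above make the same one-line change: the config_index field is dropped from each config's metadata, so a benchmark config is now identified by its file alone rather than by an index into it. A quick way to confirm that no YAML in a config tree still carries the retired key is a small scan like the following sketch (the directory path is a placeholder, not the repo's actual layout):

from pathlib import Path

import yaml

CONFIG_ROOT = Path("disagg_configs")  # placeholder; point at the real config tree

def find_stale_config_index(root: Path) -> list[Path]:
    """Return every YAML file whose metadata block still has config_index."""
    stale = []
    for path in sorted(root.rglob("*.yaml")):
        data = yaml.safe_load(path.read_text())
        # config_index used to live under the top-level metadata block.
        if isinstance(data, dict) and "config_index" in data.get("metadata", {}):
            stale.append(path)
    return stale

if __name__ == "__main__":
    for path in find_stale_config_index(CONFIG_ROOT):
        print(f"still has config_index: {path}")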

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 13
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM
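
To read these new 128k8k files, it helps to see how the hardware and worker_config blocks translate into a cluster footprint. A minimal sizing sketch, assuming each server instance occupies tensor_parallel_size x pipeline_parallel_size GPUs (the usual disaggregated layout; the YAML itself does not spell this out):

import math

import yaml

cfg = yaml.safe_load(open("config.yaml"))  # hypothetical filename for the file above

def server_gpus(worker: dict) -> int:
    # GPUs claimed by one server instance, under the assumption stated above.
    return worker["tensor_parallel_size"] * worker["pipeline_parallel_size"]

gen, ctx = cfg["worker_config"]["gen"], cfg["worker_config"]["ctx"]
hw = cfg["hardware"]
total_gpus = (hw["num_gen_servers"] * server_gpus(gen)
              + hw["num_ctx_servers"] * server_gpus(ctx))
nodes = math.ceil(total_gpus / hw["gpus_per_node"])
# For this file: 13 * (4*1) + 1 * (1*4) = 56 GPUs -> 14 four-GPU GB300 nodes.
print(total_gpus, nodes)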

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '4'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 5
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,105 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 6
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 4
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM
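
The length fields in these 128k8k configs follow one consistent pattern: the ctx-side limits and the cache-transceiver buffer sit 32 tokens above the 131072-token input, and the gen-side max_seq_len adds the 8192-token output on top. This is an observed relationship across the files, not something they document:

# ISL/OSL come from benchmark.input_length / benchmark.output_length; the
# 32-token pad appears to be fixed headroom for special or draft tokens.
ISL, OSL, PAD = 131072, 8192, 32

assert ISL + PAD == 131104        # ctx max_seq_len / max_num_tokens, max_tokens_in_buffer
assert ISL + OSL + PAD == 139296  # gen max_seq_len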

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 7
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 8
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
pipeline_parallel_size: 4
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 8
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 11
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 14
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,102 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM
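
The &id001 / *id001 pair above is ordinary YAML anchor-and-alias syntax, almost certainly produced by yaml.dump() on a document in which gen and ctx referenced the same speculative_config object. Loading the file resolves the alias back into an identical mapping, as this self-contained check shows:

import yaml

doc = """
gen:
  speculative_config: &id001
    decoding_type: MTP
    num_nextn_predict_layers: 3
ctx:
  speculative_config: *id001
"""
cfg = yaml.safe_load(doc)
# The alias resolves to the same content as the anchored block.
assert cfg["gen"]["speculative_config"] == cfg["ctx"]["speculative_config"]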

@@ -0,0 +1,102 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '4'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 2
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM
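
Note that concurrency_list is a quoted scalar rather than a YAML list, so a sweep over several client concurrencies would presumably be packed into one delimited string. A minimal parser under that assumption (the delimiter handling is a guess, not taken from the benchmark scripts):

def parse_concurrency_list(raw: str) -> list[int]:
    # Accept spaces or commas as separators, e.g. '1 4 16' or '1,4,16'.
    return [int(tok) for tok in raw.replace(",", " ").split()]

assert parse_concurrency_list("4") == [4]
assert parse_concurrency_list("1 4 16") == [1, 4, 16]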

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,103 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,103 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,103 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 5
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,103 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 7
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 2
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 7
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,99 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '4'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 8
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
allreduce_strategy: MNNVL
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,105 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 2
num_gen_servers: 7
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: false
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
allreduce_strategy: MNNVL
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM

@@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '8'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 2
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 8
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM
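
worker_env_var is likewise one flat string of KEY=VALUE pairs (it merely wraps across two lines in the YAML dump). A sketch of how a launcher could fold it into a worker's environment; the actual Slurm script may well do this differently:

import os
import shlex

worker_env_var = ("TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 "
                  "TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 "
                  "ENROOT_ALLOW_DEV=yes")

env = dict(os.environ)
# Each token is KEY=VALUE; split only on the first '=' in case values contain one.
env.update(pair.split("=", 1) for pair in shlex.split(worker_env_var))
assert env["TRTLLM_ENABLE_PDL"] == "1"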

@@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 2
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 3
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 16
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
ctx:
max_batch_size: 16
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
moe_config:
backend: TRTLLM
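
Across these GB300 variants, num_nextn_predict_layers is swept from 1 to 3 alongside concurrency. As a rough mental model only (standard chain-draft speculative-decoding arithmetic, not anything measured in this commit), each extra MTP layer buys diminishing returns governed by the per-token acceptance rate p:

def expected_tokens_per_step(n_draft: int, p: float) -> float:
    # One token is always produced; draft token k survives with probability p**k.
    return 1.0 + sum(p ** k for k in range(1, n_draft + 1))

for n in (1, 2, 3):
    print(n, round(expected_tokens_per_step(n, 0.8), 2))  # 1.8, 2.44, 2.95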

@@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '16'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 3
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 16
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 16
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,102 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '8'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 3
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 2
ctx:
max_batch_size: 8
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,102 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '2'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 3
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config: *id001
moe_config:
backend: TRTLLM

@@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '4'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 3
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '256'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 5
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 16
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 16
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

@@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 5
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 8
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM
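
The variant above enables MTP speculative decoding on both workers (decoding_type: MTP, num_nextn_predict_layers: 3), yet the gen worker keeps max_num_tokens at 128. A quick capacity sketch, under the assumption that each decode step schedules one target token plus num_nextn_predict_layers draft tokens per request (the usual MTP budget; the config itself does not state this), shows why that fits:

# Assumption: 1 target + num_nextn_predict_layers draft tokens per request per step.
def mtp_tokens_per_step(batch_size: int, nextn_layers: int) -> int:
    return batch_size * (1 + nextn_layers)

assert mtp_tokens_per_step(batch_size=8, nextn_layers=3) == 32  # well under max_num_tokens: 128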

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '64'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 5
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 2
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 2
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 5
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '256'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 7
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 16
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
ctx:
max_batch_size: 16
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '512'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 7
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 32
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '512'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 8
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
ctx:
max_batch_size: 32
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 8
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 4
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,98 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '256'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 8
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 8
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
moe_config:
backend: TRTLLM

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek-r1-fp4
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '256'
input_length: 131072
output_length: 8192
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 8
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 128
max_seq_len: 139296
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
stream_interval: 20
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 8
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
pipeline_parallel_size: 4
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.4
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: DEFAULT
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
moe_config:
backend: TRTLLM
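
Topology-wise, every one of these configs pairs several small prefill (ctx) servers with a single wide generation server: each ctx server is tp1 x pp4 (pp8 in the GB200 variants), while the gen server spreads attention-dp or tensor parallelism across 16 or 32 GPUs. The helper below is illustrative only (not repo code) and computes the GPU footprint implied by the hardware and worker_config sections of the config above:

# Illustrative helper: GPU footprint implied by a disagg config (gpus_per_node: 4).
def gpu_footprint(num_servers: int, tp: int, pp: int, gpus_per_node: int = 4):
    gpus = num_servers * tp * pp
    return gpus, gpus // gpus_per_node  # (total GPUs, full nodes)

ctx = gpu_footprint(num_servers=8, tp=1, pp=4)   # (32, 8): eight 4-GPU prefill servers
gen = gpu_footprint(num_servers=1, tp=32, pp=1)  # (32, 8): one 32-way attention-dp server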

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 3
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
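
This hunk, and the dozens like it below, makes the same one-line change: config_index is dropped from metadata, so a config is identified by its descriptive fields alone. Purely as a hypothetical illustration of what that enables (this is not the repo's loader; the paths, helper name, and grouping key are invented), a runner could index the remaining metadata like so:

# Hypothetical sketch -- not the repo's loader. Groups configs by the
# descriptive metadata that remains once config_index is removed.
import glob

import yaml  # PyYAML, assumed available

def index_configs(pattern: str = "disagg_configs/**/*.yaml") -> dict:
    table: dict = {}
    for path in glob.glob(pattern, recursive=True):
        with open(path) as f:
            meta = yaml.safe_load(f)["metadata"]
        for gpu in meta["supported_gpus"]:
            key = (meta["model_name"], meta["benchmark_type"], gpu)
            table.setdefault(key, []).append(path)  # keys are not unique
    return table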

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 7
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 1
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 5
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 0
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 4
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 2
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 6
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 8
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 12
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 9
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 13
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 11
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 15
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 10
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 14
slurm:
script_file: disaggr_torch.slurm
partition: <partition>

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 0
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
accuracy:
datasets:

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 6
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
accuracy:
datasets:

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 8
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 11
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 10
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 13
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 9
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 12
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 1
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 3
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 0
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 2
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -8,7 +8,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 7
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 14
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 5
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 7
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 4
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 6
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: 6
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -7,7 +7,6 @@ metadata:
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
config_index: 6
dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json
slurm:
script_file: disaggr_torch.slurm

View File

@ -26,6 +26,45 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
# 128k8k GB300 (pp4) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
# 128k8k GB200 (pp8) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
# WIDEEP cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
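
The benchmark IDs above encode the topology of their matching YAML config; the mapping is inferred from the files in this commit rather than documented: ctxN = num_ctx_servers, ppN = ctx pipeline_parallel_size, genN = num_gen_servers, depN/tepN = gen parallel width with/without attention-dp, bsN = gen max_batch_size, eplbN = expert-load-balancer slot count, mtpN = num_nextn_predict_layers. A sketch of a parser for the 128k8k IDs (the wideep IDs, which omit ppN and append _ccb, are not covered):

import re

# Decode the 128k8k benchmark-ID naming scheme (field meanings inferred by
# matching IDs against the YAML configs in this commit, e.g.
# ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3 <-> num_ctx_servers: 5, ctx pp: 4,
# gen attention-dp EP 32, gen max_batch_size: 2, MTP nextn: 3).
PATTERN = re.compile(
    r"ctx(?P<ctx>\d+)_pp(?P<pp>\d+)_gen(?P<gen>\d+)_"
    r"(?P<par>dep|tep)(?P<width>\d+)_bs(?P<bs>\d+)_"
    r"eplb(?P<eplb>\d+)_mtp(?P<mtp>\d+)"
)

m = PATTERN.search(
    "disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3"
)
assert m is not None and m.group("par") == "dep"  # dep = attention-dp (tep = plain TP)
print(m.groupdict())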

View File

@ -24,3 +24,42 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
# 128k8k GB300 (pp4) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
# 128k8k GB200 (pp8) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]