mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[TRTLLM-9965][test] add long-context disagg test for GB300/GB200 and remove config_index in yaml (#10225)
Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com> Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
This commit is contained in:
parent
692d8f2023
commit
0f4ed90560
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 17
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 21
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 19
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 23
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 16
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 20
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 18
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 22
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -8,7 +8,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 21
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -8,7 +8,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 21
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 13
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '4'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 5
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,105 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 6
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 4
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 7
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 8
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 4
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 8
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 11
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 14
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,102 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,102 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '4'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 2
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,103 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,103 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,103 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 5
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,103 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 7
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 2
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '1'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 7
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,99 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '4'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 1
|
||||
num_gen_servers: 8
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,105 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 2
|
||||
num_gen_servers: 7
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
allreduce_strategy: MNNVL
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '8'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 2
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 2
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '128'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 3
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
ctx:
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '16'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 3
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,102 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '8'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 3
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 2
|
||||
ctx:
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,102 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '2'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 3
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config: &id001
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config: *id001
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '4'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 3
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 8
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '256'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 5
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '128'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 5
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '64'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 5
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '128'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 5
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '256'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 7
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
ctx:
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '512'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 7
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '512'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 8
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 16
|
||||
moe_expert_parallel_size: 16
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
ctx:
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 1
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '128'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 8
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,98 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '256'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 8
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
ctx:
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -0,0 +1,104 @@
|
||||
metadata:
|
||||
model_name: deepseek-r1-fp4
|
||||
precision: fp4
|
||||
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||
supported_gpus:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 128k8k
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
account: <account>
|
||||
job_time: 02:00:00
|
||||
job_name: unified-benchmark
|
||||
extra_args: --gres=gpu:4
|
||||
numa_bind: true
|
||||
benchmark:
|
||||
mode: e2e
|
||||
use_nv_sa_benchmark: true
|
||||
multi_round: 1
|
||||
benchmark_ratio: 0.8
|
||||
streaming: true
|
||||
concurrency_list: '256'
|
||||
input_length: 131072
|
||||
output_length: 8192
|
||||
dataset_file: <dataset_file>
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
num_ctx_servers: 8
|
||||
num_gen_servers: 1
|
||||
environment:
|
||||
container_mount: <container_mount>
|
||||
container_image: <container_image>
|
||||
model_path: <model_path>
|
||||
trtllm_repo: ''
|
||||
build_wheel: false
|
||||
work_dir: <full_path_to_work_dir>
|
||||
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||
profiling:
|
||||
nsys_on: false
|
||||
accuracy:
|
||||
enable_accuracy_test: false
|
||||
worker_config:
|
||||
gen:
|
||||
tensor_parallel_size: 32
|
||||
moe_expert_parallel_size: 32
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
- 512
|
||||
print_iter_log: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
stream_interval: 20
|
||||
num_postprocess_workers: 4
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
ctx:
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 131104
|
||||
max_seq_len: 131104
|
||||
tensor_parallel_size: 1
|
||||
moe_expert_parallel_size: 1
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 4
|
||||
print_iter_log: true
|
||||
cuda_graph_config: null
|
||||
disable_overlap_scheduler: true
|
||||
kv_cache_config:
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.4
|
||||
dtype: fp8
|
||||
cache_transceiver_config:
|
||||
max_tokens_in_buffer: 131104
|
||||
backend: DEFAULT
|
||||
speculative_config:
|
||||
decoding_type: MTP
|
||||
num_nextn_predict_layers: 3
|
||||
moe_config:
|
||||
backend: TRTLLM
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 3
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 7
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 1
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 5
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 0
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 4
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 2
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 6
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 8
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 12
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 9
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 13
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 11
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 15
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 10
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 14
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
partition: <partition>
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 0
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
accuracy:
|
||||
datasets:
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 6
|
||||
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
||||
accuracy:
|
||||
datasets:
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 8
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 11
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 10
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 13
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 9
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 12
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 1
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 3
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 0
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 2
|
||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -8,7 +8,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 7
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 14
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 5
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 7
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 4
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 6
|
||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 1k1k
|
||||
config_index: 6
|
||||
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -7,7 +7,6 @@ metadata:
|
||||
- GB300
|
||||
script_file: disaggr_torch.slurm
|
||||
benchmark_type: 8k1k
|
||||
config_index: 6
|
||||
dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json
|
||||
slurm:
|
||||
script_file: disaggr_torch.slurm
|
||||
|
||||
@ -26,6 +26,45 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
|
||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
|
||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
|
||||
|
||||
# 128k8k GB300 (pp4) cases
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
|
||||
|
||||
# 128k8k GB200 (pp8) cases
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
# WIDEEP cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
@@ -24,3 +24,42 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
# 128k8k GB300 (pp4) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
# 128k8k GB200 (pp8) cases
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
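Each parametrized case name in the lists above encodes one point of the disaggregated-serving sweep as underscore-delimited field/value tokens (context- and generation-server counts, parallelism layout, batch size, EPLB and MTP settings), followed by a trailing transfer-backend tag such as Default, UCX, or NIXL. As a minimal, purely illustrative sketch (this helper is not part of the test harness; the field meanings are inferred only from the case names and the accompanying YAML configs), the tokens can be split out like this:

import re

# One case name copied from the list above (illustrative only; the real
# harness generates these names via pytest parametrization).
case = ("disagg_perf_deepseek-r1-fp4_128k8k_"
        "ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default")

# Split off the trailing transfer-backend tag (e.g. Default, UCX, NIXL).
config, _, backend = case.rpartition("-")

# Collect the <letters><number> tokens: ctx3, pp8, gen1, dep32, bs4, eplb0, mtp0.
fields = dict(re.findall(r"_([a-z]+)(\d+)", config))

print(backend)  # -> Default
print(fields)   # -> {'ctx': '3', 'pp': '8', 'gen': '1', 'dep': '32', 'bs': '4', 'eplb': '0', 'mtp': '0'}

A single case can then be selected with pytest's standard keyword filter, for example pytest test_disagg.py -k "ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0", assuming the surrounding benchmark environment (model checkpoints, Slurm settings, dataset files) is already provisioned.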