mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[TRTLLM-9965][test] add long-context disagg test for GB300/GB200 and remove config_index in yaml (#10225)
Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com> Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
This commit is contained in:
parent
692d8f2023
commit
0f4ed90560
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 17
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 21
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 19
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 23
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 16
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 20
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 18
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 22
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -8,7 +8,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 21
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -8,7 +8,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 21
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 13
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '4'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 5
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,105 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 6
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 7
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 8
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 8
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 11
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 14
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,102 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,102 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '4'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 2
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 5
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 7
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 2
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '1'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 7
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 1
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,99 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '4'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 1
|
||||||
|
num_gen_servers: 8
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 4
|
||||||
|
moe_expert_parallel_size: 4
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,105 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 2
|
||||||
|
num_gen_servers: 7
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: false
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
allreduce_strategy: MNNVL
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '8'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 2
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 2
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '128'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 3
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
moe_expert_parallel_size: 8
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '16'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 3
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,102 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '8'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 3
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 2
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,102 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '2'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 3
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config: &id001
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config: *id001
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB200
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '4'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 3
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 8
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '256'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 5
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '128'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 5
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '64'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 5
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 2
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '128'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 5
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '256'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 7
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 16
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '512'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 7
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 32
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 32
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '512'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 8
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 16
|
||||||
|
moe_expert_parallel_size: 16
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 32
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 32
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 1
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '128'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 8
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 4
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '256'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 8
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -0,0 +1,104 @@
|
|||||||
|
metadata:
|
||||||
|
model_name: deepseek-r1-fp4
|
||||||
|
precision: fp4
|
||||||
|
model_dir_name: DeepSeek-R1-0528-FP4-v2
|
||||||
|
supported_gpus:
|
||||||
|
- GB300
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
benchmark_type: 128k8k
|
||||||
|
slurm:
|
||||||
|
script_file: disaggr_torch.slurm
|
||||||
|
partition: <partition>
|
||||||
|
account: <account>
|
||||||
|
job_time: 02:00:00
|
||||||
|
job_name: unified-benchmark
|
||||||
|
extra_args: --gres=gpu:4
|
||||||
|
numa_bind: true
|
||||||
|
benchmark:
|
||||||
|
mode: e2e
|
||||||
|
use_nv_sa_benchmark: true
|
||||||
|
multi_round: 1
|
||||||
|
benchmark_ratio: 0.8
|
||||||
|
streaming: true
|
||||||
|
concurrency_list: '256'
|
||||||
|
input_length: 131072
|
||||||
|
output_length: 8192
|
||||||
|
dataset_file: <dataset_file>
|
||||||
|
hardware:
|
||||||
|
gpus_per_node: 4
|
||||||
|
num_ctx_servers: 8
|
||||||
|
num_gen_servers: 1
|
||||||
|
environment:
|
||||||
|
container_mount: <container_mount>
|
||||||
|
container_image: <container_image>
|
||||||
|
model_path: <model_path>
|
||||||
|
trtllm_repo: ''
|
||||||
|
build_wheel: false
|
||||||
|
work_dir: <full_path_to_work_dir>
|
||||||
|
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1
|
||||||
|
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
|
||||||
|
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
|
||||||
|
profiling:
|
||||||
|
nsys_on: false
|
||||||
|
accuracy:
|
||||||
|
enable_accuracy_test: false
|
||||||
|
worker_config:
|
||||||
|
gen:
|
||||||
|
tensor_parallel_size: 32
|
||||||
|
moe_expert_parallel_size: 32
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 1
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 128
|
||||||
|
max_seq_len: 139296
|
||||||
|
cuda_graph_config:
|
||||||
|
enable_padding: true
|
||||||
|
batch_sizes:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 8
|
||||||
|
- 16
|
||||||
|
- 32
|
||||||
|
- 64
|
||||||
|
- 128
|
||||||
|
- 256
|
||||||
|
- 512
|
||||||
|
print_iter_log: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
stream_interval: 20
|
||||||
|
num_postprocess_workers: 4
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
ctx:
|
||||||
|
max_batch_size: 8
|
||||||
|
max_num_tokens: 131104
|
||||||
|
max_seq_len: 131104
|
||||||
|
tensor_parallel_size: 1
|
||||||
|
moe_expert_parallel_size: 1
|
||||||
|
enable_attention_dp: true
|
||||||
|
pipeline_parallel_size: 4
|
||||||
|
print_iter_log: true
|
||||||
|
cuda_graph_config: null
|
||||||
|
disable_overlap_scheduler: true
|
||||||
|
kv_cache_config:
|
||||||
|
enable_block_reuse: false
|
||||||
|
free_gpu_memory_fraction: 0.4
|
||||||
|
dtype: fp8
|
||||||
|
cache_transceiver_config:
|
||||||
|
max_tokens_in_buffer: 131104
|
||||||
|
backend: DEFAULT
|
||||||
|
speculative_config:
|
||||||
|
decoding_type: MTP
|
||||||
|
num_nextn_predict_layers: 3
|
||||||
|
moe_config:
|
||||||
|
backend: TRTLLM
|
||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 3
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 7
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 1
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 5
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 0
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 4
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 2
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 6
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 8
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 12
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 9
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 13
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 11
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 15
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 10
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 14
|
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
partition: <partition>
|
partition: <partition>
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 0
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
accuracy:
|
accuracy:
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 6
|
|
||||||
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
||||||
accuracy:
|
accuracy:
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 8
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 11
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 10
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 13
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 9
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 12
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 1
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 3
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 0
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 2
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -8,7 +8,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 7
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 14
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 5
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 7
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 4
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 6
|
|
||||||
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 1k1k
|
benchmark_type: 1k1k
|
||||||
config_index: 6
|
|
||||||
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/kimi-k2-1024-1024-100000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -7,7 +7,6 @@ metadata:
|
|||||||
- GB300
|
- GB300
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
benchmark_type: 8k1k
|
benchmark_type: 8k1k
|
||||||
config_index: 6
|
|
||||||
dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json
|
dataset_file: disagg_datasets/kimi-k2-8192-1024-20000-ratio-1_for_serve.json
|
||||||
slurm:
|
slurm:
|
||||||
script_file: disaggr_torch.slurm
|
script_file: disaggr_torch.slurm
|
||||||
|
|||||||
@ -26,6 +26,45 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
|
|||||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
|
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
|
||||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
|
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
|
||||||
|
|
||||||
|
# 128k8k GB300 (pp4) cases
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
|
||||||
|
|
||||||
|
# 128k8k GB200 (pp8) cases
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
|
||||||
|
|
||||||
|
|
||||||
# WIDEEP cases
|
# WIDEEP cases
|
||||||
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
|
||||||
|
|||||||
@ -24,3 +24,42 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-
|
|||||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX]
|
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX]
|
||||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
|
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX]
|
||||||
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
|
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL]
|
||||||
|
|
||||||
|
# 128k8k GB300 (pp4) cases
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default]
|
||||||
|
|
||||||
|
# 128k8k GB200 (pp8) cases
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
|
||||||
|
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user