# TensorRT-LLMs/tests/scripts/perf-sanity/config_database_h200_sxm.yaml

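# Each server_configs entry pairs one server launch configuration (parallelism,
# CUDA graph settings, KV cache options, MoE backend) with the client_configs
# used to benchmark it (concurrency, iteration count, and input/output sequence
# lengths, "isl"/"osl"). Entry names encode model_isl_osl_concurrency_gpucount.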
server_configs:
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
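# Unlike the DeepSeek-R1 entries above, the conc64 long-input variant below
# enables attention data parallelism and tunes its batching/balancing behavior.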
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 128
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.75
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
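# gpt-oss-120b (FP4): ISL 1024 / OSL 1024 sweep over TP = EP in {1, 2, 4, 8}
# and concurrency in {4, 16, 64}; CUDA graph max_batch_size tracks concurrency.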
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
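# gpt-oss-120b (FP4): ISL 1024 / OSL 8192 sweep (same TP/EP and concurrency grid).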
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
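# gpt-oss-120b (FP4): ISL 8192 / OSL 1024 sweep (same TP/EP and concurrency grid).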
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: auto
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  moe_config:
    backend: TRITON
  num_postprocess_workers: 4
  print_iter_log: true
  stream_interval: 20
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
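# For orientation, a rough sketch of how the first entry might be launched by
# hand (hypothetical invocation; exact trtllm-serve flag names and the split
# between CLI flags and the extra-options YAML may differ across TensorRT-LLM
# versions -- the perf-sanity harness constructs this automatically):
#
#   trtllm-serve <deepseek_r1_0528_fp8 checkpoint path> \
#     --backend pytorch --tp_size 8 --ep_size 8 \
#     --max_num_tokens 1152 --max_seq_len 2068 \
#     --kv_cache_free_gpu_memory_fraction 0.75 \
#     --extra_llm_api_options extra_llm_api_options.yaml
#
# where extra_llm_api_options.yaml would carry the nested options above
# (cuda_graph_config, kv_cache_config, moe_config, stream_interval, ...).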