server_configs:
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con32_isl1024_osl1024
    concurrency: 32
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1344
  max_seq_len: 2068
  client_configs:
  - name: con256_isl1024_osl1024
    concurrency: 256
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con32_isl1024_osl1024
    concurrency: 32
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1344
  max_seq_len: 2068
  client_configs:
  - name: con256_isl1024_osl1024
    concurrency: 256
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con32_isl8192_osl1024
    concurrency: 32
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 4
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8512
  max_seq_len: 9416
  client_configs:
  - name: con256_isl8192_osl1024
    concurrency: 256
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con32_isl8192_osl1024
    concurrency: 32
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8
  model_name: deepseek_r1_0528_fp4_v2
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 512
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: CUTLASS
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8512
  max_seq_len: 9416
  client_configs:
  - name: con256_isl8192_osl1024
    concurrency: 256
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 1152
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: false
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
  model_name: deepseek_r1_0528_fp8
  gpus: 8
  match_mode: scenario
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 256
  enable_attention_dp: true
  print_iter_log: true
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.8
    enable_block_reuse: false
  stream_interval: 10
  moe_config:
    backend: DEEPGEMM
  attention_dp_config:
    batching_wait_iters: 0
    enable_balance: true
    timeout_iters: 60
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 8320
  max_seq_len: 9416
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con4_isl1024_osl1024
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con16_isl1024_osl1024
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 2068
  client_configs:
  - name: con64_isl1024_osl1024
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl1024_osl8192
    concurrency: 4
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl1024_osl8192
    concurrency: 16
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl1024_osl8192
    concurrency: 64
    iterations: 10
    isl: 1024
    osl: 8192
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
  model_name: gpt_oss_120b_fp4
  gpus: 1
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
  model_name: gpt_oss_120b_fp4
  gpus: 2
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 2
  moe_expert_parallel_size: 2
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
  model_name: gpt_oss_120b_fp4
  gpus: 4
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 4
  moe_expert_parallel_size: 4
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 4
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con4_isl8192_osl1024
    concurrency: 4
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 16
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con16_isl8192_osl1024
    concurrency: 16
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
  model_name: gpt_oss_120b_fp4
  gpus: 8
  match_mode: scenario
  env_overrides:
    TRTLLM_ENABLE_PDL: 1
    NCCL_GRAPH_REGISTER: 0
  cuda_graph_config:
    enable_padding: true
    max_batch_size: 64
  enable_attention_dp: false
  kv_cache_config:
    dtype: fp8
    free_gpu_memory_fraction: 0.85
    enable_block_reuse: false
  print_iter_log: true
  stream_interval: 20
  num_postprocess_workers: 4
  moe_config:
    backend: TRTLLM
  tensor_parallel_size: 8
  moe_expert_parallel_size: 8
  trust_remote_code: true
  backend: pytorch
  max_num_tokens: 20000
  max_seq_len: 9236
  client_configs:
  - name: con64_isl8192_osl1024
    concurrency: 64
    iterations: 10
    isl: 8192
    osl: 1024
    random_range_ratio: 0.0
    backend: openai
    streaming: true
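# After hand-editing this list it is easy to break the nesting, so a quick
# structural check helps. The sketch below is a minimal PyYAML loader
# (the filename benchmark_scenarios.yaml is hypothetical; point it at this
# file). It prints one summary line per server/client pairing and asserts
# that tensor_parallel_size matches gpus, which holds for every entry here.
#
#   import yaml  # PyYAML
#
#   with open("benchmark_scenarios.yaml") as f:
#       doc = yaml.safe_load(f)
#
#   for server in doc["server_configs"]:
#       # Each scenario pairs one server launch with one or more client runs.
#       assert server["tensor_parallel_size"] == server["gpus"]
#       for client in server["client_configs"]:
#           print(
#               f"{server['name']}: {server['model_name']} on "
#               f"{server['gpus']} GPU(s), concurrency={client['concurrency']}, "
#               f"ISL/OSL={client['isl']}/{client['osl']}"
#           )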