# TensorRT-LLM: tensorrt_llm/bench/build/benchmark_config.yml
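#
# Engine build presets for the TensorRT-LLM benchmarking flow (tensorrt_llm/bench).
# Structure, as read from the entries below:
#   <Hugging Face model ID>:
#     <parallel mapping, e.g. tp4_pp1 = tensor parallel 4, pipeline parallel 1>:
#       general:        # default max_batch_size / max_num_tokens for this mapping
#       <numeric key>:  # override for a specific target; these look like total
#                       # sequence lengths (ISL + OSL), but that reading is an
#                       # assumption and is not stated in the file itself.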
meta-llama/Llama-2-7b-hf:
  tp1_pp1:
    general:
      max_batch_size: 4096
      max_num_tokens: 8192
meta-llama/Llama-2-70b-hf:
  tp2_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 2048
  tp4_pp1:
    general:
      max_batch_size: 4096
      max_num_tokens: 8192
    4224:
      max_batch_size: 256
      max_num_tokens: 8192
  tp8_pp1:
    general:
      max_batch_size: 8192
      max_num_tokens: 16384
    2176:
      max_batch_size: 1024
      max_num_tokens: 16384
tiiuae/falcon-180B:
  tp4_pp1:
    general:
      max_batch_size: 4096
      max_num_tokens: 8192
  tp8_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 8192
EleutherAI/gpt-j-6b:
  tp1_pp1:
    general:
      max_batch_size: 128
      max_num_tokens: 2048
    256:
      max_batch_size: 2048
      max_num_tokens: 2048
meta-llama/Meta-Llama-3-8B:
  tp1_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 8192
meta-llama/Llama-3.1-8B: &llama_3_1_8b
  tp1_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 8192
meta-llama/Meta-Llama-3-70B:
  tp1_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 2048
  tp2_pp1:
    general:
      max_batch_size: 256
      max_num_tokens: 1024
    4096:
      max_batch_size: 2048
      max_num_tokens: 1024
  tp4_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 1024
  tp8_pp1:
    general:
      max_batch_size: 8192
      max_num_tokens: 16384
meta-llama/Llama-3.1-70B: &llama_3_1_70b
  tp1_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 2048
  tp2_pp1:
    general:
      max_batch_size: 256
      max_num_tokens: 1024
    4096:
      max_batch_size: 2048
      max_num_tokens: 1024
  tp4_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 1024
  tp8_pp1:
    general:
      max_batch_size: 8192
      max_num_tokens: 16384
meta-llama/Llama-3.1-405B: &llama_3_1_405b
  tp8_pp1:
    general:
      max_batch_size: 1024
      max_num_tokens: 4096
    256:
      max_batch_size: 2048
      max_num_tokens: 4096
    2000:
      max_batch_size: 1280
      max_num_tokens: 2560
    2176:
      max_batch_size: 1024
      max_num_tokens: 4096
    2500:
      max_batch_size: 1024
      max_num_tokens: 2048
    4096:
      max_batch_size: 512
      max_num_tokens: 2048
    4224:
      max_batch_size: 512
      max_num_tokens: 2048
    5500:
      max_batch_size: 512
      max_num_tokens: 5120
    22000:
      max_batch_size: 128
      max_num_tokens: 2048
mistralai/Mixtral-8x7B-v0.1: &mixtral_8x7b_0_1
  tp2_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 3072
  tp4_pp1:
    general:
      max_batch_size: 8192
      max_num_tokens: 8192
  tp8_pp1:
    general:
      max_batch_size: 8192
      max_num_tokens: 8192
mistralai/Mixtral-8x22B-v0.1: &mixtral_8x22b_0_1
  tp8_pp1:
    general:
      max_batch_size: 2048
      max_num_tokens: 8192
    256:
      max_batch_size: 8192
      max_num_tokens: 16384
    2176:
      max_batch_size: 2048
      max_num_tokens: 16384
    4224:
      max_batch_size: 1024
      max_num_tokens: 2048
    5500:
      max_batch_size: 1024
      max_num_tokens: 8192
mistralai/Mistral-7B-v0.1:
  tp1_pp1:
    general:
      max_batch_size: 4096
      max_num_tokens: 8192
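# The Instruct variants below reuse their base models' presets via the YAML
# anchors defined above (*llama_3_1_8b, *llama_3_1_70b, ...).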
meta-llama/Llama-3.1-8B-Instruct: *llama_3_1_8b
meta-llama/Llama-3.1-70B-Instruct: *llama_3_1_70b
meta-llama/Llama-3.1-405B-Instruct: *llama_3_1_405b
mistralai/Mixtral-8x7B-Instruct-v0.1: *mixtral_8x7b_0_1
mistralai/Mixtral-8x22B-Instruct-v0.1: *mixtral_8x22b_0_1