mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-14 06:53:50 +08:00
162 lines
3.5 KiB
YAML
162 lines
3.5 KiB
YAML
meta-llama/Llama-2-7b-hf:
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 4096
|
|
max_num_tokens: 8192
|
|
meta-llama/Llama-2-70b-hf:
|
|
tp2_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 2048
|
|
tp4_pp1:
|
|
general:
|
|
max_batch_size: 4096
|
|
max_num_tokens: 8192
|
|
4224:
|
|
max_batch_size: 256
|
|
max_num_tokens: 8192
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 16384
|
|
2176:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 16384
|
|
tiiuae/falcon-180B:
|
|
tp4_pp1:
|
|
general:
|
|
max_batch_size: 4096
|
|
max_num_tokens: 8192
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 8192
|
|
EleutherAI/gpt-j-6b:
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 128
|
|
max_num_tokens: 2048
|
|
256:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 2048
|
|
meta-llama/Meta-Llama-3-8B:
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 8192
|
|
meta-llama/Llama-3.1-8B: &llama_3_1_8b
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 8192
|
|
meta-llama/Meta-Llama-3-70B:
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 2048
|
|
tp2_pp1:
|
|
general:
|
|
max_batch_size: 256
|
|
max_num_tokens: 1024
|
|
4096:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 1024
|
|
tp4_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 1024
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 16384
|
|
meta-llama/Llama-3.1-70B: &llama_3_1_70b
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 2048
|
|
tp2_pp1:
|
|
general:
|
|
max_batch_size: 256
|
|
max_num_tokens: 1024
|
|
4096:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 1024
|
|
tp4_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 1024
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 16384
|
|
meta-llama/Llama-3.1-405B: &llama_3_1_405b
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 4096
|
|
256:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 4096
|
|
2000:
|
|
max_batch_size: 1280
|
|
max_num_tokens: 2560
|
|
2176:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 4096
|
|
2500:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 2048
|
|
4096:
|
|
max_batch_size: 512
|
|
max_num_tokens: 2048
|
|
4224:
|
|
max_batch_size: 512
|
|
max_num_tokens: 2048
|
|
5500:
|
|
max_batch_size: 512
|
|
max_num_tokens: 5120
|
|
22000:
|
|
max_batch_size: 128
|
|
max_num_tokens: 2048
|
|
mistralai/Mixtral-8x7B-v0.1: &mixtral_8x7b_0_1
|
|
tp2_pp1:
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 3072
|
|
tp4_pp1:
|
|
general:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 8192
|
|
tp8_pp1:
|
|
general:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 8192
|
|
mistralai/Mixtral-8x22B-v0.1: &mixtral_8x22b_0_1
|
|
tp8_pp1:
|
|
256:
|
|
max_batch_size: 8192
|
|
max_num_tokens: 16384
|
|
2176:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 16384
|
|
4224:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 2048
|
|
5500:
|
|
max_batch_size: 1024
|
|
max_num_tokens: 8192
|
|
general:
|
|
max_batch_size: 2048
|
|
max_num_tokens: 8192
|
|
mistralai/Mistral-7B-v0.1:
|
|
tp1_pp1:
|
|
general:
|
|
max_batch_size: 4098
|
|
max_num_tokens: 8192
|
|
|
|
meta-llama/Llama-3.1-8B-Instruct: *llama_3_1_8b
|
|
meta-llama/Llama-3.1-70B-Instruct: *llama_3_1_70b
|
|
meta-llama/Llama-3.1-405B-Instruct: *llama_3_1_405b
|
|
mistralai/Mixtral-8x7B-Instruct-v0.1: *mixtral_8x7b_0_1
|
|
mistralai/Mixtral-8x22B-Instruct-v0.1: *mixtral_8x22b_0_1
|