#!/usr/bin/bash
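#
# Build TensorRT-LLM engines used for Triton backend testing.
# Usage: <this script> <model>, where <model> is one of the names matched
# below (e.g. "gpt", "opt", "llama", "mistral-ib", "whisper", "qwen2_vl").
# Assumes LLM_ROOT, LLM_MODELS_ROOT and LLM_BACKEND_ROOT are set by the CI environment.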

install_requirements() {
    pip install -r requirements.txt
    # WAR: the latest transformers release appears to be incompatible with
    # sentencepiece 0.1.99, so force sentencepiece to the newest version.
    pip install sentencepiece --upgrade
}

MODEL=$1

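# Checkpoint locations inside the shared model cache.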
GPT2=$LLM_MODELS_ROOT/gpt2
GPT2_MEDIUM=$LLM_MODELS_ROOT/gpt2-medium
GPT2_NEXT_PTUNING=$LLM_MODELS_ROOT/email_composition
OPT_125M=$LLM_MODELS_ROOT/opt-125m
LLAMA=$LLM_MODELS_ROOT/llama-models/llama-7b-hf
GPTJ=$LLM_MODELS_ROOT/gpt-j-6b
MISTRAL=$LLM_MODELS_ROOT/mistral-7b-v0.1
GPT_2B=$LLM_MODELS_ROOT/GPT-2B-001_bf16_tp1.nemo
GPT_2B_LORA=$LLM_MODELS_ROOT/lora/gpt-next-2b
VICUNA=$LLM_MODELS_ROOT/vicuna-7b-v1.3
MEDUSA_VICUNA=$LLM_MODELS_ROOT/medusa-vicuna-7b-v1.3/
EAGLE_VICUNA=$LLM_MODELS_ROOT/EAGLE-Vicuna-7B-v1.3/
BART=$LLM_MODELS_ROOT/bart-large-cnn/
T5=$LLM_MODELS_ROOT/t5-small/
BLIP2_OPT_2_7B=$LLM_MODELS_ROOT/blip2-opt-2.7b
LLAVA_7B=$LLM_MODELS_ROOT/llava-1.5-7b-hf
VILA1_5_3B=$LLM_MODELS_ROOT/vila/VILA1.5-3b
VILA_PATH=$LLM_MODELS_ROOT/vila/VILA
LLAMA_3_2_11B_VISION=$LLM_MODELS_ROOT/llama-3.2-models/Llama-3.2-11B-Vision-Instruct
WHISPER_LARGE_V3=$LLM_MODELS_ROOT/whisper-models/large-v3
LLAVA_ONEVISION_7B=$LLM_MODELS_ROOT/llava-onevision-qwen2-7b-ov-hf
QWEN2_VL_7B=$LLM_MODELS_ROOT/Qwen2-VL-7B-Instruct

set -e

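# Kill any tritonserver left over from a previous run; ignore the error if none is running.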
pkill -9 -f tritonserver || true

pushd $LLM_ROOT/

# install deps
pip3 install -r requirements-dev.txt

if [ "$MODEL" = "gpt" ] || [ "$MODEL" = "gpt-disaggregated-serving-bls" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2} --dtype float16 --output_dir ./c-model/gpt2/fp16
|
|
|
|
echo "Build GPT: float16 | remove_input_padding"
|
|
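    # Single-GPU FP16 engine. The attention and GEMM plugins run those layers as
    # TensorRT-LLM plugins, context_fmha enables fused multi-head attention for the
    # context phase, and remove_input_padding packs the batch without padding tokens.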
    trtllm-build --checkpoint_dir ./c-model/gpt2/fp16 \
        --gpt_attention_plugin float16 \
        --gemm_plugin float16 \
        --context_fmha enable \
        --remove_input_padding enable \
        --max_batch_size 8 \
        --max_seq_len 1024 \
        --max_num_tokens 7392 \
        --output_dir trt_engine/gpt2/fp16/1-gpu/

    popd # examples/models/core/gpt

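    # Optional sanity check (not run here): the engine can be exercised with the
    # standard examples/run.py helper, assuming it is available in this checkout, e.g.
    #   python3 examples/run.py --engine_dir examples/models/core/gpt/trt_engine/gpt2/fp16/1-gpu/ \
    #       --tokenizer_dir ${GPT2} --max_output_len 16 --input_text "Hello"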
fi

if [ "$MODEL" = "opt" ]; then
|
|
|
|
pushd examples/models/contrib/opt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert OPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${OPT_125M} --dtype float16 --output_dir ./c-model/opt-125m/fp16
|
|
|
|
echo "OPT builder"
|
|
trtllm-build --checkpoint_dir ./c-model/opt-125m/fp16 \
|
|
--gemm_plugin float16 \
|
|
--gpt_attention_plugin float16 \
|
|
--context_fmha=enable \
|
|
--max_batch_size 8 \
|
|
--max_seq_len 1024 \
|
|
--max_num_tokens 7392 \
|
|
--output_dir trt_engine/opt-125m/fp16/1-gpu/
|
|
|
|
|
|
popd # examples/models/contrib/opt
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "llama" ]; then
|
|
|
|
pushd examples/models/core/llama
|
|
|
|
install_requirements
|
|
|
|
echo "Convert LLaMA from HF"
|
|
python3 convert_checkpoint.py --dtype float16 --n_layer 2 --output_dir ./c-model/llama-7b/fp16
|
|
|
|
echo "Build LLaMA"
|
|
trtllm-build --model_config ./c-model/llama-7b/fp16/config.json \
|
|
--context_fmha=enable \
|
|
--gpt_attention_plugin float16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--output_dir llama_outputs
|
|
|
|
popd # examples/models/core/llama
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "mistral" ]; then
|
|
|
|
pushd examples/models/core/llama
|
|
|
|
install_requirements
|
|
|
|
echo "Convert Mistral from HF"
|
|
python3 convert_checkpoint.py --dtype float16 \
|
|
--n_layer 2 --n_positions 32768 \
|
|
--output_dir ./c-model/mistral-7b/fp16
|
|
|
|
echo "Build Mistral"
|
|
trtllm-build --model_config ./c-model/mistral-7b/fp16/config.json \
|
|
--context_fmha=enable \
|
|
--gpt_attention_plugin float16 \
|
|
--gemm_plugin float16 \
|
|
--max_input_len 8192 \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 8192 \
|
|
--output_dir mistral_7b_outputs
|
|
|
|
popd # examples/models/core/llama
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "mistral-ib" ]; then
|
|
|
|
pushd examples/models/core/llama
|
|
|
|
install_requirements
|
|
|
|
echo "Convert Mistral from HF"
|
|
python3 convert_checkpoint.py --dtype float16 \
|
|
--model_dir ${MISTRAL} \
|
|
--output_dir ./c-model/mistral-7b/fp16
|
|
|
|
echo "Build Mistral with inflight batching"
|
|
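    # In-flight batching through the Triton backend requires packed inputs and a paged
    # KV cache, hence remove_input_padding=enable and kv_cache_type=paged below.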
    trtllm-build --checkpoint_dir ./c-model/mistral-7b/fp16/ \
        --context_fmha=enable \
        --remove_input_padding=enable \
        --kv_cache_type=paged \
        --gpt_attention_plugin float16 \
        --gemm_plugin float16 \
        --max_batch_size 1 \
        --max_seq_len 9216 \
        --use_paged_context_fmha disable \
        --max_beam_width 2 \
        --output_dir ib_mistral_7b_outputs

    popd # examples/models/core/llama

fi

if [ "$MODEL" = "gptj" ]; then
|
|
|
|
pushd examples/models/contrib/gptj
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT-J from HF"
|
|
python3 convert_checkpoint.py --dtype float16 --n_layer 2 --output_dir ./c-model/gpt-j-6b/fp16
|
|
|
|
echo "Build GPT-J"
|
|
trtllm-build --model_config ./c-model/gpt-j-6b/fp16/config.json \
|
|
--context_fmha=enable \
|
|
--gpt_attention_plugin float16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--output_dir gptj_outputs
|
|
|
|
popd # examples/models/contrib/gptj
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "gpt-ib" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2} --dtype float16 --output_dir ./c-model/gpt2/fp16
|
|
|
|
echo "Build GPT: float16"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt2/fp16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--use_paged_context_fmha enable \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 7392 \
|
|
--gather_generation_logits \
|
|
--output_dir trt_engine/gpt2-ib/fp16/1-gpu/
|
|
|
|
popd # examples/models/core/gpt
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "gpt-ib-lad" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
# install_requirements
|
|
|
|
echo "Convert GPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2} --dtype float16 --output_dir ./c-model/gpt2/fp16
|
|
|
|
echo "Build GPT: float16"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt2/fp16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--use_paged_context_fmha enable \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 7392 \
|
|
--gather_generation_logits \
|
|
--max_draft_len=83 \
|
|
--speculative_decoding_mode=lookahead_decoding \
|
|
--output_dir trt_engine/gpt2-ib-lad/fp16/1-gpu/
|
|
|
|
popd # examples/models/core/gpt
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "bart-ib" ] || [ "$MODEL" = "t5-ib" ]; then
|
|
|
|
pushd examples/models/core/enc_dec
|
|
|
|
if [ "$MODEL" = "bart-ib" ]; then
|
|
MODEL_DIR=${BART}
|
|
MODEL_TYPE="bart"
|
|
elif [ "$MODEL" = "t5-ib" ]; then
|
|
MODEL_DIR=${T5}
|
|
MODEL_TYPE="t5"
|
|
fi
|
|
echo "Convert ${MODEL_TYPE} from HF"
|
|
python3 convert_checkpoint.py --model_type ${MODEL_TYPE} --model_dir ${MODEL_DIR} --dtype float16 --output_dir ./c-model/${MODEL}/fp16
|
|
|
|
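    # Encoder-decoder models produce two engines: the encoder consumes the full input
    # sequence, while the decoder generates one token at a time (max_input_len 1) and
    # attends to the encoder output through cross-attention (max_encoder_input_len 512).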
echo "Build Encoder: "
|
|
trtllm-build --checkpoint_dir ./c-model/${MODEL}/fp16/encoder \
|
|
--output_dir trt_engine/${MODEL}/fp16/1-gpu/encoder \
|
|
--kv_cache_type disabled --moe_plugin disable \
|
|
--max_beam_width 1 \
|
|
--max_batch_size 8 --max_input_len 512 --max_seq_len 512 \
|
|
--gemm_plugin float16 \
|
|
--bert_attention_plugin float16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable --context_fmha disable
|
|
|
|
|
|
echo "Build Decoder:"
|
|
trtllm-build --checkpoint_dir ./c-model/${MODEL}/fp16/decoder \
|
|
--output_dir trt_engine/${MODEL}/fp16/1-gpu/decoder \
|
|
--moe_plugin disable \
|
|
--max_beam_width 1 \
|
|
--max_batch_size 8 --max_input_len 1 --max_seq_len 512 --max_encoder_input_len 512 \
|
|
--gemm_plugin float16 \
|
|
--bert_attention_plugin float16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable --context_fmha disable
|
|
|
|
popd # examples/models/core/enc_dec
|
|
|
|
fi
|
|
|
|
|
|
if [ "$MODEL" = "gpt-medium-ib" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT2 medium from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2_MEDIUM} --dtype float16 --output_dir ./c-model/gpt2-medium/fp16
|
|
|
|
echo "Build GPT2 medium control: float16"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt2-medium/fp16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--gemm_plugin float16 \
|
|
--context_fmha enable \
|
|
--use_paged_context_fmha enable \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 7392 \
|
|
--gather_generation_logits \
|
|
--output_dir trt_engine/gpt2-medium-ib/fp16/1-gpu/
|
|
|
|
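    # Second engine for draft-target speculative decoding: same weights, but built with
    # speculative_decoding_mode=draft_tokens_external so it can verify up to
    # max_draft_len (5) externally supplied draft tokens per step.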
echo "Build GPT2 medium target: float16"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt2-medium/fp16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--gemm_plugin float16 \
|
|
--context_fmha enable \
|
|
--use_paged_context_fmha enable \
|
|
--max_draft_len 5 \
|
|
--speculative_decoding_mode draft_tokens_external \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 7392 \
|
|
--gather_generation_logits \
|
|
--output_dir trt_engine/gpt2-medium-ib-target/fp16/1-gpu/
|
|
|
|
popd # examples/models/core/gpt
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "gpt-ib-ptuning" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from NeMo"
|
|
python3 convert_checkpoint.py --nemo_ckpt_path ${GPT2_NEXT_PTUNING}/megatron_converted_8b_tp4_pp1.nemo --dtype float16 --output_dir ./c-model/email_composition/fp16
|
|
|
|
echo "Convert ptuning table"
|
|
python3 nemo_prompt_convert.py -i ${GPT2_NEXT_PTUNING}/email_composition.nemo -o email_composition.npy
|
|
|
|
cp ${GPT2_NEXT_PTUNING}/input.csv ./
|
|
|
|
echo "Build GPT: float16"
|
|
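    # max_prompt_embedding_table_size reserves room in the engine for the prompt-tuning
    # (virtual token) embeddings converted above by nemo_prompt_convert.py.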
    trtllm-build --checkpoint_dir ./c-model/email_composition/fp16 \
        --gpt_attention_plugin float16 \
        --remove_input_padding enable \
        --kv_cache_type paged \
        --gemm_plugin float16 \
        --context_fmha enable \
        --max_batch_size 4 --max_seq_len 256 --max_beam_width 1 \
        --max_num_tokens 512 \
        --output_dir trt_engine/email_composition/fp16/1-gpu/ \
        --max_prompt_embedding_table_size 300

    popd # examples/models/core/gpt

fi

if [ "$MODEL" = "gpt-2b-ib-lora" ]; then
|
|
|
|
# GPT-2B
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from NeMo"
|
|
python3 convert_checkpoint.py --nemo_ckpt_path ${GPT_2B} --dtype float16 --output_dir ./c-model/gpt-2b-lora/fp16
|
|
|
|
echo "Build GPT: float16"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt-2b-lora/fp16 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--gemm_plugin float16 \
|
|
--lora_plugin float16 \
|
|
--lora_dir ${GPT_2B_LORA}/gpt2b_lora-900.nemo \
|
|
--lora_ckpt_source nemo \
|
|
--lora_target_modules attn_qkv \
|
|
--max_batch_size 8 \
|
|
--max_num_tokens 7392 \
|
|
--output_dir trt_engine/gpt-2b-lora-ib/fp16/1-gpu/
|
|
|
|
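    # Convert the NeMo LoRA adapter into two layouts: C++-runtime tensors
    # (--write-cpp-runtime-tensors) and the default TRT-LLM format.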
    python3 nemo_lora_convert.py -i ${GPT_2B_LORA}/gpt2b_lora-900.nemo \
        -o gpt-2b-lora-train-900 --write-cpp-runtime-tensors --storage-type float16
    python3 nemo_lora_convert.py -i ${GPT_2B_LORA}/gpt2b_lora-900.nemo \
        -o gpt-2b-lora-train-900-tllm --storage-type float16
    cp ${GPT_2B_LORA}/gpt2b_lora-900.nemo .

    cp ${GPT_2B_LORA}/input.csv .

    popd # examples/models/core/gpt
fi

if [ "$MODEL" = "gpt-gather-logits" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2} --dtype float16 --output_dir ./c-model/gpt2/fp16
|
|
|
|
echo "Build GPT: float16 | gather_all_token_logits"
|
|
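    # gather_all_token_logits makes the engine return logits for both the context and
    # the generation phase, not just the generated tokens.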
    trtllm-build --checkpoint_dir ./c-model/gpt2/fp16 \
        --gpt_attention_plugin float16 \
        --remove_input_padding enable \
        --kv_cache_type paged \
        --gemm_plugin float16 \
        --context_fmha enable \
        --max_batch_size 128 --max_seq_len 600 \
        --gather_all_token_logits \
        --output_dir trt_engine/gpt2-gather-logits/fp16/1-gpu/ \
        --max_num_tokens 38400

    popd # examples/models/core/gpt

fi

if [ "$MODEL" = "medusa" ]; then
|
|
|
|
# Medusa
|
|
pushd examples/medusa
|
|
|
|
install_requirements
|
|
|
|
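    # Medusa adds extra decoding heads on top of the Vicuna base model: the checkpoint
    # combines the base weights with the four Medusa heads, and the engine is built
    # with speculative_decoding_mode=medusa.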
echo "Convert Medusa from HF"
|
|
python convert_checkpoint.py --model_dir ${VICUNA} \
|
|
--medusa_model_dir ${MEDUSA_VICUNA} \
|
|
--output_dir ./tllm_checkpoint_1gpu_medusa \
|
|
--dtype float16 \
|
|
--num_medusa_heads 4
|
|
|
|
echo "Build Medusa: float16"
|
|
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
|
|
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
|
|
--gemm_plugin float16 \
|
|
--speculative_decoding_mode medusa \
|
|
--max_batch_size 8 --max_seq_len 600 \
|
|
--max_num_tokens 2400
|
|
|
|
popd # examples/medusa
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "eagle" ]; then
|
|
|
|
# Eagle
|
|
pushd examples/eagle
|
|
|
|
install_requirements
|
|
|
|
echo "Convert Eagle from HF"
|
|
python convert_checkpoint.py --model_dir ${VICUNA} \
|
|
--eagle_model_dir ${EAGLE_VICUNA} \
|
|
--output_dir ./tllm_checkpoint_1gpu_eagle \
|
|
--dtype float16 \
|
|
--num_eagle_layers 4
|
|
|
|
echo "Build Eagle: float16"
|
|
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_eagle \
|
|
--output_dir ./tmp/eagle/7B/trt_engines/fp16/1-gpu/ \
|
|
--gemm_plugin float16 \
|
|
--speculative_decoding_mode eagle \
|
|
--max_batch_size 8 --max_seq_len 600 \
|
|
--max_num_tokens 2400
|
|
|
|
popd # examples/eagle
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "gpt-gather-generation-logits" ]; then
|
|
|
|
# GPT2
|
|
pushd examples/models/core/gpt
|
|
|
|
install_requirements
|
|
|
|
echo "Convert GPT from HF"
|
|
python3 convert_checkpoint.py --model_dir ${GPT2} --dtype float16 --output_dir ./c-model/gpt2/fp16
|
|
|
|
# draft model, only gather_generation_logits
|
|
echo "Build GPT: float16 | gather_all_token_logits"
|
|
trtllm-build --checkpoint_dir ./c-model/gpt2/fp16 \
|
|
--max_batch_size 4 \
|
|
--max_seq_len 640 \
|
|
--gpt_attention_plugin float16 \
|
|
--remove_input_padding enable \
|
|
--kv_cache_type paged \
|
|
--context_fmha enable \
|
|
--max_num_tokens 38400 \
|
|
--use_paged_context_fmha enable \
|
|
--gather_generation_logits \
|
|
--output_dir trt_engine/gpt2-draft-gather-generation-logits/fp16/1-gpu/
|
|
|
|
popd # examples/models/core/gpt
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "blip2-opt" ]; then
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
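    # Multimodal models are built in two parts: the language-model engine via
    # trtllm-build (max_multimodal_len reserves sequence positions for the image
    # features) and the vision encoder engine via build_multimodal_engine.py.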
echo "Convert OPT from HF"
|
|
python3 ../../contrib/opt/convert_checkpoint.py --model_type blip2 --model_dir ${BLIP2_OPT_2_7B} --dtype float16 --output_dir ./c-model/opt-2.7b/fp16
|
|
|
|
echo "OPT builder"
|
|
trtllm-build --checkpoint_dir ./c-model/opt-2.7b/fp16 \
|
|
--gemm_plugin float16 \
|
|
--max_beam_width 1 \
|
|
--max_batch_size 8 \
|
|
--max_multimodal_len 256 \
|
|
--max_input_len 924 \
|
|
--max_seq_len 1024 \
|
|
--use_paged_context_fmha enable \
|
|
--output_dir trt_engines/opt-2.7b/fp16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_type blip2 --model_path ${BLIP2_OPT_2_7B} --max_batch_size 8
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "llava" ]; then
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert LLaMA from HF"
|
|
python3 ../llama/convert_checkpoint.py --model_dir ${LLAVA_7B} --dtype float16 --output_dir ./c-model/llava-1.5-7b-hf/fp16
|
|
|
|
echo "LLAVA builder"
|
|
trtllm-build --checkpoint_dir ./c-model/llava-1.5-7b-hf/fp16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--max_input_len 2048 \
|
|
--max_seq_len 2560 \
|
|
--max_multimodal_len 4608 \
|
|
--output_dir trt_engines/llava-1.5-7b-hf/fp16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${LLAVA_7B} --model_type llava --max_batch_size 8
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "llava_fp8" ]; then
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert LLaVA from HF to FP8"
|
|
python3 ../../../quantization/quantize.py \
|
|
--model_dir ${LLAVA_7B} \
|
|
--dtype float16 \
|
|
--qformat fp8 \
|
|
--kv_cache_dtype fp8 \
|
|
--output_dir ./c-model/llava-1.5-7b-hf/fp8 \
|
|
--calib_size 512
|
|
|
|
echo "LLAVA builder for FP8"
|
|
trtllm-build --checkpoint_dir ./c-model/llava-1.5-7b-hf/fp8 \
|
|
--gemm_plugin auto \
|
|
--max_batch_size 8 \
|
|
--max_input_len 2048 \
|
|
--max_seq_len 2560 \
|
|
--max_multimodal_len 4608 \
|
|
--output_dir trt_engines/llava-1.5-7b-hf/fp8/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${LLAVA_7B} --model_type llava --max_batch_size 8
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "vila" ]; then
|
|
|
|
echo "Install vila requirements"
|
|
pip install -r $LLM_BACKEND_ROOT/all_models/multimodal/requirements-vila.txt
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert LLaMA from HF"
|
|
python3 ../llama/convert_checkpoint.py --model_dir ${VILA1_5_3B} --dtype float16 --output_dir ./c-model/vila1.5-3b/fp16
|
|
|
|
echo "VILA builder"
|
|
trtllm-build --checkpoint_dir ./c-model/vila1.5-3b/fp16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 8 \
|
|
--max_input_len 2048 \
|
|
--max_seq_len 2560 \
|
|
--max_multimodal_len 6272 \
|
|
--output_dir trt_engines/vila1.5-3b/fp16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${VILA1_5_3B} --model_type vila --max_batch_size 32 --vila_path ${VILA_PATH}
|
|
|
|
popd # examples/models/core/multimodal
|
|
fi
|
|
|
|
if [ "$MODEL" = "mllama" ]; then
|
|
|
|
echo "Install mllama requirements"
|
|
pip install -r $LLM_BACKEND_ROOT/all_models/multimodal/requirements-mllama.txt
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert mllama from HF"
|
|
python3 ../mllama/convert_checkpoint.py --model_dir ${LLAMA_3_2_11B_VISION} --dtype bfloat16 --output_dir ./c-model/Llama-3.2-11B-Vision-Instruct/bf16
|
|
|
|
echo "mllama builder"
|
|
trtllm-build --checkpoint_dir ./c-model/Llama-3.2-11B-Vision-Instruct/bf16 \
|
|
--gemm_plugin auto \
|
|
--max_batch_size 8 \
|
|
--max_input_len 2048 \
|
|
--max_seq_len 2560 \
|
|
--max_encoder_input_len 8200 \
|
|
--output_dir trt_engines/Llama-3.2-11B-Vision-Instruct/bf16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${LLAMA_3_2_11B_VISION} \
|
|
--model_type mllama --max_batch_size 8 \
|
|
--output_dir tmp/trt_engines/Llama-3.2-11B-Vision-Instruct/multimodal_encoder
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "whisper" ]; then
|
|
|
|
pushd examples/models/core/whisper
|
|
|
|
echo "Convert OpenAI Whisper Checkpoint"
|
|
python3 convert_checkpoint.py --model_dir ${WHISPER_LAREGE_V3} --output_dir ./c-model/${MODEL}/tllm_checkpoint
|
|
|
|
echo "Build Whisper Encoder: "
|
|
trtllm-build --checkpoint_dir ./c-model/${MODEL}/tllm_checkpoint/encoder \
|
|
--output_dir trt_engine/${MODEL}/encoder \
|
|
--moe_plugin disable \
|
|
--max_batch_size 8 \
|
|
--gemm_plugin disable \
|
|
--bert_attention_plugin float16 \
|
|
--max_input_len 3000 --max_seq_len=3000
|
|
|
|
echo "Build Whisper Decoder: "
|
|
trtllm-build --checkpoint_dir ./c-model/${MODEL}/tllm_checkpoint/decoder \
|
|
--output_dir trt_engine/${MODEL}/decoder \
|
|
--moe_plugin disable \
|
|
--max_beam_width 4 \
|
|
--max_batch_size 8 \
|
|
--max_seq_len 114 \
|
|
--max_input_len 14 \
|
|
--max_encoder_input_len 3000 \
|
|
--gemm_plugin float16 \
|
|
--bert_attention_plugin float16 \
|
|
--gpt_attention_plugin float16
|
|
popd # examples/models/core/whisper
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "llava_onevision" ]; then
|
|
|
|
echo "Install llava_onevision requirements"
|
|
pip install -r $LLM_BACKEND_ROOT/all_models/multimodal/requirements-llava-onevision.txt
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert Qwen from HF"
|
|
python3 ../qwen/convert_checkpoint.py --model_dir ${LLAVA_ONEVISION_7B} --dtype float16 --output_dir ./c-model/llava-7b/fp16
|
|
|
|
echo "Qwen builder"
|
|
trtllm-build --checkpoint_dir ./c-model/llava-7b/fp16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 1 \
|
|
--max_input_len 7500 \
|
|
--max_seq_len 7600 \
|
|
--max_multimodal_len 7300 \
|
|
--output_dir trt_engines/llava-onevision-7b/fp16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${LLAVA_ONEVISION_7B} --model_type llava_onevision --max_batch_size 16 --output_dir tmp/trt_engines/llava-onevision-qwen2-7b-ov-hf/multimodal_encoder
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
if [ "$MODEL" = "qwen2_vl" ]; then
|
|
echo "Install llava_onevision requirements"
|
|
pip install -r $LLM_BACKEND_ROOT/all_models/multimodal/requirements-qwen2vl.txt
|
|
|
|
pushd examples/models/core/multimodal
|
|
|
|
echo "Convert Qwen from HF"
|
|
python3 ../qwen/convert_checkpoint.py --model_dir ${QWEN2_VL_7B} --dtype float16 --output_dir ./c-model/qwen2-vl-7b/fp16
|
|
|
|
echo "Qwen builder"
|
|
trtllm-build --checkpoint_dir ./c-model/qwen2-vl-7b/fp16 \
|
|
--gemm_plugin float16 \
|
|
--max_batch_size 4 \
|
|
--max_input_len 2048 \
|
|
--max_seq_len 76307200 \
|
|
--max_multimodal_len 1296 \
|
|
--output_dir trt_engines/qwen2-vl-7b/fp16/1-gpu
|
|
|
|
python build_multimodal_engine.py --model_path ${QWEN2_VL_7B} --model_type qwen2_vl --output_dir tmp/trt_engines/Qwen2-VL-7B-Instruct/multimodal_encoder
|
|
|
|
popd # examples/models/core/multimodal
|
|
|
|
fi
|
|
|
|
popd # $LLM_ROOT