#!/bin/bash
# End-to-end tests for the LLaMA example: checkpoint conversion, engine build,
# and generation/accuracy checks across parallelism and quantization modes.

set -ex

export PATH=~/.local/bin/:$PATH # trtllm-build is inside ~/.local/bin
export MODEL=/home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf/
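
# Smoke test with a fake 2-layer config: build from config.json alone (no real
# weights) and decode a single token.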
test_fake_config() {
    python3 convert_checkpoint.py --dtype float16 --n_layer 2 --output_dir ./c-model/llama-7b/fp16

    trtllm-build --model_config ./c-model/llama-7b/fp16/config.json \
        --gemm_plugin float16 \
        --max_batch_size 8 \
        --output_dir ./llama_nlayer_2

    python3 ../run.py --max_output_len=1 --engine_dir ./llama_nlayer_2
}
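
# Convert the Meta-format LLaMA-v2 7B checkpoint with TP=2, build, then check
# ROUGE-1 >= 18 against the HF baseline run.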
test_meta() {
    python convert_checkpoint.py --meta_ckpt_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/7B/ --output_dir ./tllm_checkpoint/llama-v2-7b-ckpt-from-meta --tp_size 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/llama-v2-7b-ckpt-from-meta --output_dir ./trt_engines/llama-v2-7b-engine-tp2-meta --gemm_plugin float16

    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-7b-hf/ \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2-meta \
            --test_hf
}
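
# Convert the HF checkpoint with TP=2 (two parallel conversion workers), build,
# and run the same summarization accuracy check.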
test_hf() {
    python convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/tp2_hf --tp_size 2 --workers 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/tp2_hf --output_dir ./trt_engines/llama-v2-7b-engine-tp2 --gemm_plugin float16

    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2 \
            --test_hf
}
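
# Same as test_hf, but --load_by_shard streams the HF weights shard by shard,
# lowering peak host memory during conversion.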
test_hf_by_shard() {
    python convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/tp2_hf-by-shard --tp_size 2 --workers 2 --load_by_shard

    trtllm-build --checkpoint_dir ./tllm_checkpoint/tp2_hf-by-shard --output_dir ./trt_engines/llama-v2-7b-engine-tp2-by-shard --gemm_plugin float16

    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2-by-shard \
            --test_hf
}
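
# INT8 weight-only quantization with an INT8 KV cache on a single GPU.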
test_wo_int8() {
    python convert_checkpoint.py --model_dir ${MODEL} \
        --output_dir ./tllm_checkpoint/1gpu_fp16_wq \
        --dtype float16 \
        --use_weight_only \
        --weight_only_precision int8 \
        --int8_kv_cache

    trtllm-build --checkpoint_dir ./tllm_checkpoint/1gpu_fp16_wq \
        --output_dir trt_engines/int8_kv_cache_weight_only/1-gpu \
        --gemm_plugin float16

    python ../summarize.py --test_trt_llm \
        --hf_model_dir ${MODEL} \
        --data_type fp16 \
        --engine_dir trt_engines/int8_kv_cache_weight_only/1-gpu \
        --test_hf
}
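
# SmoothQuant (alpha=0.5) INT8 quantization plus an INT8 KV cache.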
test_sq() {
    python3 convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/sq --dtype float16 --smoothquant 0.5 --int8_kv_cache

    trtllm-build --checkpoint_dir ./tllm_checkpoint/sq --output_dir ./trt_engines/sq --gemm_plugin float16

    python ../summarize.py --test_trt_llm --hf_model_dir ${MODEL} --data_type fp16 --engine_dir trt_engines/sq --test_hf
}
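
# INT4-GPTQ weight-only quantization (per-group, group size 128) loaded from a
# pre-quantized safetensors checkpoint, with TP=2.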
test_gptq() {
    python convert_checkpoint.py --model_dir ${MODEL} \
        --output_dir ./tllm_checkpoint/2gpu_gptq \
        --dtype float16 \
        --quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
        --use_weight_only \
        --weight_only_precision int4_gptq \
        --per_group \
        --tp_size 2 \
        --workers 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/2gpu_gptq \
        --output_dir ./trt_engines/gptq \
        --gemm_plugin float16

    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir trt_engines/gptq \
            --test_hf
}
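
# Attach a Chinese LLaMA-2 LoRA to the 13B base model (TP=2) and generate from
# a Chinese prompt through the Python session.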
test_lora() {
    lora_dir=/home/scratch.trt_llm_data/llm-models/llama-models-v2/chinese-llama-2-lora-13b

    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-13b-hf \
        --output_dir ./tllm_checkpoint/2gpu_lora \
        --dtype float16 \
        --tp_size 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/2gpu_lora \
        --output_dir ./trt_engines/llama-v2-13b-with-lora \
        --gemm_plugin float16 \
        --lora_plugin float16 \
        --lora_dir ${lora_dir} \
        --max_batch_size 1 \
        --max_input_len 512 \
        --max_seq_len 562

    # The prompt translates to: "The weather is nice today. When I arrived at the park,"
    mpirun -n 2 --allow-run-as-root \
        python ../run.py --engine_dir ./trt_engines/llama-v2-13b-with-lora \
            --max_output_len 50 \
            --tokenizer_dir ${lora_dir} \
            --input_text "今天天气很好,我到公园的时候," \
            --lora_task_uids 0 \
            --no_add_special_tokens \
            --use_py_session
}
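
# Mixtral 8x7B with pipeline parallelism across two ranks; conversion and build
# only, no generation step.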
test_mixtral() {
    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/Mixtral-8x7B-v0.1/ \
        --output_dir ./tllm_checkpoint/mixtral_2gpu \
        --dtype float16 \
        --pp_size 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/mixtral_2gpu \
        --output_dir ./trt_engines/mixtral/pp2 \
        --gemm_plugin float16
}
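
# LongAlpaca-7B (RoPE scaling): feed a 32K-token input file through a TP=2
# engine to exercise the long-context path.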
test_long_alpaca_rope_scaling() {
    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/LongAlpaca-7B/ \
        --output_dir ./tllm_checkpoint/long_alpaca_tp2 \
        --dtype float16 \
        --tp_size 2

    trtllm-build --checkpoint_dir ./tllm_checkpoint/long_alpaca_tp2 \
        --output_dir ./trt_engines/long_alpaca_tp2 \
        --gemm_plugin float16 \
        --max_input_len 32768

    mpirun -n 2 --allow-run-as-root \
        python ../run.py \
            --max_output_len 128 \
            --max_input_length 32768 \
            --input_file ../../tests/llm-test-defs/turtle/test_input_files/pg64317_sanitized.txt \
            --engine_dir ./trt_engines/long_alpaca_tp2 \
            --tokenizer_dir /home/scratch.trt_llm_data/llm-models/LongAlpaca-7B/
}
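
# LLaVA-1.5 7B multimodal: build the language-model half with room reserved for
# 576 visual features per image.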
test_llava() {
    python ../llama/convert_checkpoint.py \
        --model_dir /home/scratch.trt_llm_data/llm-models/llava-1.5-7b-hf/ \
        --output_dir ./trt_checkpoint/llava-1gpu \
        --dtype float16

    trtllm-build \
        --checkpoint_dir ./trt_checkpoint/llava-1gpu \
        --output_dir ./trt_engines/llava/fp16/1-gpu \
        --gemm_plugin float16 \
        --max_batch_size 1 \
        --max_input_len 2048 \
        --max_seq_len 2560 \
        --max_multimodal_len 576 # 1 (max_batch_size) * 576 (num_visual_features)
}
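
# bfloat16 single-GPU checkpoint conversion only; no engine is built here.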
test_bfloat16() {
    python convert_checkpoint.py --output_dir ./tllm_checkpoint/llama_v2-summarization/bfloat16/1-gpu --dtype=bfloat16 --tp_size=1 --pp_size=1 --model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-7b-hf
}
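
# Run the whole suite (test_hf_by_shard and test_bfloat16 are not included).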
test_all() {
    test_fake_config
    test_meta
    test_hf
    test_wo_int8
    test_sq
    test_gptq
    test_lora
    test_mixtral
    test_long_alpaca_rope_scaling
    test_llava
}
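
# Nothing runs by default: source this file and call an individual test
# function, or test_all, from the examples/llama directory.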