#!/bin/bash
# TensorRT-LLMs/tests/test_llama_conversion.sh
set -ex
export PATH=~/.local/bin/:$PATH # trtllm-build is inside ~/.local/bin
export MODEL=/home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf/
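
# Smoke test: generate a dummy 2-layer LLaMA config, build an engine from the
# config alone (no real checkpoint), and run a single decoding step.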
test_fake_config() {
    python3 convert_checkpoint.py --dtype float16 --n_layer 2 --output_dir ./c-model/llama-7b/fp16
    trtllm-build --model_config ./c-model/llama-7b/fp16/config.json \
        --gemm_plugin float16 \
        --max_batch_size 8 \
        --output_dir ./llama_nlayer_2
    python3 ../run.py --max_output_len=1 --engine_dir ./llama_nlayer_2
}
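
# Convert the Meta-format LLaMA-v2 7B checkpoint with TP=2, build engines, and
# score summarization (ROUGE-1 >= 18) against the HF baseline under mpirun.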
test_meta() {
    python convert_checkpoint.py --meta_ckpt_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/7B/ --output_dir ./tllm_checkpoint/llama-v2-7b-ckpt-from-meta --tp_size 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/llama-v2-7b-ckpt-from-meta --output_dir ./trt_engines/llama-v2-7b-engine-tp2-meta --gemm_plugin float16
    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-7b-hf/ \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2-meta \
            --test_hf
}
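
# Same pipeline starting from the Hugging Face llama-7b checkpoint, TP=2 with
# two conversion workers.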
test_hf() {
    python convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/tp2_hf --tp_size 2 --workers 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/tp2_hf --output_dir ./trt_engines/llama-v2-7b-engine-tp2 --gemm_plugin float16
    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2 \
            --test_hf
}
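
# Variant of test_hf that passes --load_by_shard so the HF checkpoint is
# converted shard by shard (lower peak host memory).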
test_hf_by_shard() {
    python convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/tp2_hf-by-shard --tp_size 2 --workers 2 --load_by_shard
    trtllm-build --checkpoint_dir ./tllm_checkpoint/tp2_hf-by-shard --output_dir ./trt_engines/llama-v2-7b-engine-tp2-by-shard --gemm_plugin float16
    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --tensorrt_llm_rouge1_threshold 18 \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir ./trt_engines/llama-v2-7b-engine-tp2-by-shard \
            --test_hf
}
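
# Single-GPU INT8 weight-only quantization combined with an INT8 KV cache.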
test_wo_int8() {
    python convert_checkpoint.py --model_dir ${MODEL} \
        --output_dir ./tllm_checkpoint/1gpu_fp16_wq \
        --dtype float16 \
        --use_weight_only \
        --weight_only_precision int8 \
        --int8_kv_cache
    trtllm-build --checkpoint_dir ./tllm_checkpoint/1gpu_fp16_wq \
        --output_dir trt_engines/int8_kv_cache_weight_only/1-gpu \
        --gemm_plugin float16
    python ../summarize.py --test_trt_llm \
        --hf_model_dir ${MODEL} \
        --data_type fp16 \
        --engine_dir trt_engines/int8_kv_cache_weight_only/1-gpu \
        --test_hf
}
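
# SmoothQuant (alpha 0.5) conversion with an INT8 KV cache.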
test_sq() {
    python3 convert_checkpoint.py --model_dir ${MODEL} --output_dir ./tllm_checkpoint/sq --dtype float16 --smoothquant 0.5 --int8_kv_cache
    trtllm-build --checkpoint_dir ./tllm_checkpoint/sq --output_dir ./trt_engines/sq --gemm_plugin float16
    python ../summarize.py --test_trt_llm --hf_model_dir ${MODEL} --data_type fp16 --engine_dir trt_engines/sq --test_hf
}
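
# INT4-GPTQ weight-only quantization (per-group) loaded from a pre-quantized
# safetensors checkpoint, split across two GPUs.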
test_gptq() {
    python convert_checkpoint.py --model_dir ${MODEL} \
        --output_dir ./tllm_checkpoint/2gpu_gptq \
        --dtype float16 \
        --quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
        --use_weight_only \
        --weight_only_precision int4_gptq \
        --per_group \
        --tp_size 2 \
        --workers 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/2gpu_gptq \
        --output_dir ./trt_engines/gptq \
        --gemm_plugin float16
    mpirun -n 2 --allow-run-as-root \
        python ../summarize.py --test_trt_llm \
            --hf_model_dir ${MODEL} \
            --data_type fp16 \
            --engine_dir trt_engines/gptq \
            --test_hf
}
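
# Apply a Chinese LLaMA-2 LoRA adapter to the 13B base model (TP=2) and run
# generation through the LoRA plugin.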
test_lora() {
    lora_dir=/home/scratch.trt_llm_data/llm-models/llama-models-v2/chinese-llama-2-lora-13b
    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-13b-hf \
        --output_dir ./tllm_checkpoint/2gpu_lora \
        --dtype float16 \
        --tp_size 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/2gpu_lora \
        --output_dir ./trt_engines/llama-v2-13b-with-lora \
        --gemm_plugin float16 \
        --lora_plugin float16 \
        --lora_dir ${lora_dir} \
        --max_batch_size 1 \
        --max_input_len 512 \
        --max_seq_len 562
    # Prompt translates to: "The weather is nice today. When I arrived at the park,"
    mpirun -n 2 --allow-run-as-root \
        python ../run.py --engine_dir ./trt_engines/llama-v2-13b-with-lora \
            --max_output_len 50 \
            --tokenizer_dir ${lora_dir} \
            --input_text "今天天气很好,我到公园的时候," \
            --lora_task_uids 0 \
            --no_add_special_tokens \
            --use_py_session
}
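
# Mixtral-8x7B with pipeline parallelism (PP=2); build-only, no accuracy check.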
test_mixtral() {
    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/Mixtral-8x7B-v0.1/ \
        --output_dir ./tllm_checkpoint/mixtral_2gpu \
        --dtype float16 \
        --pp_size 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/mixtral_2gpu \
        --output_dir ./trt_engines/mixtral/pp2 \
        --gemm_plugin float16
}
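
# LongAlpaca-7B with a 32K input window (RoPE scaling), fed a long text file.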
test_long_alpaca_rope_scaling() {
    python convert_checkpoint.py --model_dir /home/scratch.trt_llm_data/llm-models/LongAlpaca-7B/ \
        --output_dir ./tllm_checkpoint/long_alpaca_tp2 \
        --dtype float16 \
        --tp_size 2
    trtllm-build --checkpoint_dir ./tllm_checkpoint/long_alpaca_tp2 \
        --output_dir ./trt_engines/long_alpaca_tp2 \
        --gemm_plugin float16 \
        --max_input_len 32768
    mpirun -n 2 --allow-run-as-root \
        python ../run.py \
            --max_output_len 128 \
            --max_input_length 32768 \
            --input_file ../../tests/llm-test-defs/turtle/test_input_files/pg64317_sanitized.txt \
            --engine_dir ./trt_engines/long_alpaca_tp2 \
            --tokenizer_dir /home/scratch.trt_llm_data/llm-models/LongAlpaca-7B/
}
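
# LLaVA-1.5: build the language-model half with budget for 576 visual features
# per request (max_batch_size 1).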
test_llava() {
    python ../llama/convert_checkpoint.py \
        --model_dir /home/scratch.trt_llm_data/llm-models/llava-1.5-7b-hf/ \
        --output_dir ./trt_checkpoint/llava-1gpu \
        --dtype float16
    trtllm-build \
        --checkpoint_dir ./trt_checkpoint/llava-1gpu \
        --output_dir ./trt_engines/llava/fp16/1-gpu \
        --gemm_plugin float16 \
        --max_batch_size 1 \
        --max_input_len 2048 \
        --max_seq_len 2560 \
        --max_multimodal_len 576 # 1 (max_batch_size) * 576 (num_visual_features)
}
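
# bfloat16 checkpoint conversion only; no engine is built here.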
test_bfloat16() {
    python convert_checkpoint.py --output_dir ./tllm_checkpoint/llama_v2-summarization/bfloat16/1-gpu --dtype=bfloat16 --tp_size=1 --pp_size=1 --model_dir /home/scratch.trt_llm_data/llm-models/llama-models-v2/llama-v2-7b-hf
}
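
# Runs the suite; test_hf_by_shard and test_bfloat16 are not included and must
# be invoked separately.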
test_all() {
    test_fake_config
    test_meta
    test_hf
    test_wo_int8
    test_sq
    test_gptq
    test_lora
    test_mixtral
    test_long_alpaca_rope_scaling
    test_llava
}
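
# Call a test explicitly to run it, e.g.:
# test_all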