TensorRT-LLMs/benchmarks/python/kv_cache_offload/run.sh
#!/bin/bash
# Check that the Hugging Face token is set (needed to download the model)
if [[ -z "${HUGGING_FACE_HUB_TOKEN}" ]]; then
    echo "The environment variable HUGGING_FACE_HUB_TOKEN is not set." >&2
    exit 1
fi
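# Example invocation (the token value below is a placeholder, not a real token):
#   export HUGGING_FACE_HUB_TOKEN=hf_xxxxxxxxxxxxxxxx
#   ./run.sh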
# Detect the GPU with nvidia-smi; default to H100 if no GH200 is found
gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader)
if echo "$gpu_name" | grep -q "GH200"; then
    GPU="GH200"
else
    GPU="H100"
fi
echo "Running with ${GPU}."
# Generate 20 context prompts of 16,000 input tokens each
python3 "$(pwd)/../../cpp/prepare_dataset.py" \
    --output="$(pwd)/dataset.json" \
    --tokenizer=meta-llama/Llama-3.1-70B \
    token-norm-dist \
    --num-requests=20 \
    --input-mean=16000 \
    --output-mean=64 \
    --input-stdev=0 \
    --output-stdev=0
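# With both stdev flags at 0, all 20 requests are identical: exactly 16,000 input
# tokens and 64 requested output tokens, so every run below sees a uniform workload.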
# Build an FP8-quantized engine for the model
trtllm-bench --workspace "$(pwd)/${GPU}" \
    --model meta-llama/Llama-3.1-70B \
    build \
    --max_batch_size 16 \
    --max_num_tokens 17800 \
    --max_seq_len 17800 \
    --quantization FP8
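# Sizing note: max_seq_len and max_num_tokens of 17,800 leave headroom above the
# 16,000-token prompts plus 64 output tokens, and max_batch_size 16 matches the
# largest user count swept in the loop below.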
# Run the benchmark script for 2 to 16 concurrent users
for user_size in $(seq 2 16); do
    echo "Run benchmark with user size = ${user_size}."
    python3 benchmark.py \
        --model_path "$(pwd)/${GPU}/meta-llama/Llama-3.1-70B/tp_1_pp_1" \
        --input_dataset_path dataset.json \
        --n "${user_size}"
done
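# To rerun a single point by hand (same command as in the loop), e.g. 8 users:
#   python3 benchmark.py --model_path "$(pwd)/${GPU}/meta-llama/Llama-3.1-70B/tp_1_pp_1" \
#       --input_dataset_path dataset.json --n 8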