#!/bin/bash
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost}
SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
BACKEND_ROOT=${BACKEND_ROOT:="/opt/tritonserver/tensorrtllm_backend"}
DATASET="$PWD/simple_data.json"
TOOLS_DIR=${BACKEND_ROOT}/tools
STREAM_DIR=${BACKEND_ROOT}/inflight_batcher_llm/client
MODEL_DIR="$PWD/triton_model_repo"
SERVER=/opt/tritonserver/bin/tritonserver
TOKENIZER_DIR=${BACKEND_ROOT}/ci/L0_backend_trtllm/tokenizer
BASE_DIR=${BACKEND_ROOT}/ci/L0_backend_trtllm
BASE_METRICS_VERIFICATION_TEST=base_metrics_verification_tests.py
BASE_METRICS_VERIFICATION_TEST_NAME=base_metrics_verification_tests
BASE_METRICS_VERIFICATION_LOG="base_metrics_verification.log"
CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py
CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log"
SERVER_PID=0
SLEEP_DURATION=3
# Force the environment to use Python 3
apt update -q=2 \
    && apt install -y python-is-python3
# Helpers ===============================
function replace_config_tags {
    tag_to_replace="${1}"
    new_value="${2}"
    config_file_path="${3}"
    sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path}
}
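
# Example usage (hypothetical file path): fill the max batch size placeholder
# in a config template:
#   replace_config_tags '${triton_max_batch_size}' "128" "./triton_model_repo/ensemble/config.pbtxt"
# sed uses '|' as its delimiter here so that replacement values may contain
# '/' characters (e.g. the engine directory paths substituted below).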

function run_server {
    SERVER_ARGS="${1}"
    python3 ${BACKEND_ROOT}/scripts/launch_triton_server.py ${SERVER_ARGS} > ${SERVER_LOG} 2>&1 &
    sleep 2 # allow time to obtain the pid(s)
    # Read PIDs into an array, trimming whitespace
    readarray -t SERVER_PID < <(pgrep -s 0 "tritonserver")
}
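
# Note: `pgrep -s 0` matches processes in the caller's own session, so
# SERVER_PID picks up every tritonserver rank (multi-GPU world sizes launch
# one process per GPU).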

# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
# success, 1 on failure
function wait_for_server_ready() {
    local wait_time_secs="${1:-30}"; shift
    local spids=("$@")

    WAIT_RET=0

    local wait_secs=$wait_time_secs
    until test $wait_secs -eq 0 ; do
        # Multi-GPU will spawn multiple pids
        for pid in "${spids[@]}"; do
            if ! kill -0 $pid > /dev/null 2>&1; then
                echo "=== Server not running."
                WAIT_RET=1
                return
            fi
        done

        sleep 1

        set +e
        code=`curl -s -w %{http_code} ${SERVER_IPADDR}:8000/v2/health/ready`
        set -e
        if [ "$code" == "200" ]; then
            code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' localhost:8000/v2/logging`
            assert_curl_success "Failed to change log settings necessary for verification" ${BASH_LINENO}
            return
        fi

        ((wait_secs--))
    done

    echo "=== Timeout $wait_time_secs secs. Server not ready."
    WAIT_RET=1
}
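
# Typical usage, assuming SERVER_PID was populated by run_server:
#   wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
#   [ "$WAIT_RET" == "0" ] || { cat ${SERVER_LOG}; exit 1; }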

function reset_model_repo {
    rm -rf triton_model_repo/
    mkdir ${MODEL_DIR}
}

function kill_server {
    pgrep tritonserver | xargs kill -SIGINT
}
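
# SIGINT asks Triton to shut down gracefully (draining in-flight work) rather
# than killing it outright; wait_for_server_terminated below then confirms the
# processes actually exited.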

function wait_for_server_terminated {
    local wait_time_secs="${1:-30}"; shift
    local spids=("$@")
    for pid in "${spids[@]}"; do
        WAIT_RET=1
        echo "Waiting for proc ${pid} to terminate..."
        local wait_secs=$wait_time_secs
        until test $wait_secs -eq 0 ; do
            if ! (kill -0 $pid) > /dev/null 2>&1; then
                WAIT_RET=0
                break
            fi
            sleep 1
            ((wait_secs--))
        done
        if [ "$WAIT_RET" != "0" ]; then
            # Cleanup
            kill $SERVER_PID > /dev/null 2>&1 || true
            echo -e "\n***\n*** Failed to wait for server $SERVER to terminate\n***"
            cat $SERVER_LOG
            exit 1
        fi
    done
    ps aux
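    # "trtllmExecutorW" is the kernel's 15-character comm name for
    # trtllmExecutorWorker; pgrep -x matches against that truncated name.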
    if pgrep --runstates R,S,D,I -x "trtllmExecutorW" > /dev/null; then
        echo -e "Worker process still exists - failed to terminate"
        exit 1
    fi
}

function assert_curl_success {
    message="${1}"
    original_line_no="${2}"
    if [ "$code" != "200" ]; then
        cat ./curl.out
        cat ${SERVER_LOG}
        echo -e "\n***\n*** ${message} : line ${original_line_no}\n***"
        RET=1
        return 1
    fi
    return 0
}
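
# Relies on the caller having set the global $code from a preceding curl, e.g.:
#   code=`curl -s -w %{http_code} -o ./curl.out ${SERVER_IPADDR}:8000/v2/health/ready`
#   assert_curl_success "Health probe failed" ${BASH_LINENO}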

# =======================================

prerun_kill_triton_server () {
    pkill -9 -f trtllmExecutorWorker || true
    pkill -9 -f tritonserver
}
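
# Unlike kill_server's graceful SIGINT, SIGKILL (-9) clears stale processes
# from a previous run unconditionally, since they may be hung.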

# Kill tritonserver if it is still pending from a previous test
prerun_kill_triton_server || true

rm -f *.log *.out *.txt
# Generate TRT_LLM engines and install dependencies
source ./generate_engines.sh
pip3 install --upgrade tritonclient[all] pandas tabulate

export AVAILABLE_GPUS=$(nvidia-smi -L | wc -l)
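
# nvidia-smi -L prints one line per visible GPU, so the count gates which
# world sizes the loop below can exercise.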

RET=0

NUM_GPUS_TO_TEST=("1" "2" "4")
for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then
        break
    fi

    SERVER_ARGS="--world_size=${NUM_GPU} --model_repo=${MODEL_DIR}"

    reset_model_repo

    cp -r ${BACKEND_ROOT}/all_models/inflight_batcher_llm/* ${MODEL_DIR}
    rm -rf ${MODEL_DIR}/tensorrt_llm_bls
    replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/ensemble/config.pbtxt"
    replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/preprocessing/config.pbtxt"
    replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/preprocessing/config.pbtxt"
    replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt"
    replace_config_tags '${max_queue_size}' '0' "${MODEL_DIR}/preprocessing/config.pbtxt"
    replace_config_tags '${max_queue_delay_microseconds}' '50000' "${MODEL_DIR}/preprocessing/config.pbtxt"
    replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${max_queue_size}' "0" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${batching_strategy}' 'INVALID' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP16" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${prompt_embedding_table_data_type}' 'TYPE_FP16' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt"
    replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt"
    replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt"
    replace_config_tags '${logits_datatype}' 'TYPE_FP32' "${MODEL_DIR}/ensemble/config.pbtxt"
    replace_config_tags '${logits_datatype}' 'TYPE_FP32' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
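
    # The deliberately invalid '${batching_strategy}' value above feeds the
    # backend's gpt_model_type parameter; the first server launch below is
    # expected to reject it with "Invalid gpt_model_type" rather than crash.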

    # Copy the engine and place it into the model folder
    cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1

    # Invalid GPT model type
    SERVER_LOG="./${NUM_GPU}gpu_invalid_batch_strat.log"

    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # Expect invalid GPT model type error to be gracefully handled
    if [ `grep -c "Invalid gpt_model_type" $SERVER_LOG` == "0" ]; then
        echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
        cat $SERVER_LOG
        exit 1
    fi

    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # inflight batching OFF (V1)
    # streaming OFF
    SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log"
    replace_config_tags 'INVALID' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"

    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # Expect the deprecated V1/static batching mode to be reported gracefully
    if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
        echo -e "\n***\n*** Static batching deprecation warning not emitted: line ${LINENO}\n***"
        cat $SERVER_LOG
        exit 1
    fi

    # inflight batching ON
    # streaming OFF
    SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log"
    replace_config_tags 'V1' 'inflight_fused_batching' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"

    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    # Errexit must be off here so the explicit status checks below can run
    set +e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
        --max-input-len=500 \
        dataset --dataset=${DATASET} \
        --tokenizer-dir=${TOKENIZER_DIR}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi

    set +e
    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
        --max-input-len=500 \
        --dataset=${DATASET}

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi

    # Make sure the metrics are retrieved after the server has updated them internally
    sleep ${SLEEP_DURATION}
    curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # Start a clean server to verify token metrics are being
    # reported correctly
    SERVER_LOG="./${NUM_GPU}gpu_token_metrics.log"
    replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi
    set +e

    # STREAM_INPUT_SIZE matches the token count of the prompt below
    export STREAM_INPUT_SIZE=3
    export STREAM_OUTPUT_SIZE=50
    python3 ${STREAM_DIR}/end_to_end_grpc_client.py \
        --prompt="My name is" \
        --streaming \
        -o=$STREAM_OUTPUT_SIZE

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi

    # Make sure the metrics are retrieved after the server has updated them internally
    sleep ${SLEEP_DURATION}
    curl localhost:8002/metrics -o end_to_end_token_metrics.out

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    replace_config_tags 'decoupled: True' 'decoupled: False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"

    # Start a clean server to verify base metrics are being
    # reported correctly
    SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_base_metrics.log"
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi
    set -e

    set +e
    BACKEND_ROOT=${BACKEND_ROOT} python3 -m unittest ${BASE_METRICS_VERIFICATION_TEST_NAME}.TRTLLMBaseMetricsTest.test_end_to_end >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1
    if [ $? -ne 0 ]; then
        cat ${BASE_METRICS_VERIFICATION_LOG}
        echo -e "\n***\n*** Error executing base metrics verification test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        RET=1
    fi
    set -e

    set +e

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # Start a clean server to verify base metrics are reported
    # correctly with beam width > 1
    SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_base_metrics.log"
    replace_config_tags '${max_beam_width}' "2" "${MODEL_DIR}/tensorrt_llm/config.pbtxt"
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi
    set -e

    set +e
    BACKEND_ROOT=${BACKEND_ROOT} python3 -m unittest ${BASE_METRICS_VERIFICATION_TEST_NAME}.TRTLLMBaseMetricsTest.test_end_to_end_beam_width >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1
    if [ $? -ne 0 ]; then
        cat ${BASE_METRICS_VERIFICATION_LOG}
        echo -e "\n***\n*** Error executing base metrics verification test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        RET=1
    fi
    set -e

    set +e

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # World size must be 1 when using multi-model. NOTE: NUM_GPU is never "0"
    # (the loop tests 1, 2, and 4), so this block does not currently run.
    if [ "${NUM_GPU}" == "0" ]; then
        # Multi-model
        SERVER_LOG="./${NUM_GPU}gpu_multi_model.log"
        run_server "${SERVER_ARGS} --multi-model"
        wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        if [ "$WAIT_RET" != "0" ]; then
            # Cleanup
            kill $SERVER_PID > /dev/null 2>&1 || true
            echo -e "\n***\n*** Failed to start $SERVER\n***"
            cat $SERVER_LOG
            exit 1
        fi
        set +e

        python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
            --max-input-len=500 \
            --dataset=${DATASET}

        if [ $? -ne 0 ]; then
            cat $SERVER_LOG
            echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
            kill_server
            wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
            RET=1
        fi

        # Make sure the metrics are retrieved after the server has updated them internally
        sleep ${SLEEP_DURATION}
        curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out

        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    fi

    # inflight batching ON
    # streaming ON
    SERVER_LOG="./${NUM_GPU}gpu_IFB_streaming_server.log"
    replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt"

    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
    if [ "$WAIT_RET" != "0" ]; then
        # Cleanup
        kill $SERVER_PID > /dev/null 2>&1 || true
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    set +e
    python3 ${STREAM_DIR}/end_to_end_grpc_client.py \
        --prompt="My name is"

    if [ $? -ne 0 ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
        kill_server
        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
        RET=1
    fi

    # Make sure the metrics are retrieved after the server has updated them internally
    sleep ${SLEEP_DURATION}
    curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_stream_metrics.out

    kill_server
    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # Per-request metrics stats
    # Use a large number of tokens to exercise KV cache reuse
    echo '{"text_input": "Machine learning is a field of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It involves the use of data and algorithms to imitate the way humans learn, gradually improving its accuracy. Machine learning is used in a variety of applications such as email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers.", "max_tokens": 50, "pad_id": 2, "end_id": 2, "return_perf_metrics": true }' > tmp.txt
    echo "Machine learning is a field of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It involves the use of data and algorithms to imitate the way humans learn, gradually improving its accuracy. Machine learning is used in a variety of applications such as email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers." > prompt.txt
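
    # The long prompt keeps the KV-cache expectations below deterministic: the
    # first request allocates fresh cache blocks and the second reuses them
    # (assuming the engine's block size maps this prompt to 4 blocks).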

    # Test the tensorrt_llm model with both the C++ (tensorrtllm) and Python backends
    for TRITON_BACKEND in tensorrtllm python; do
        for DECOUPLED_TRIAL in non-decoupled decoupled; do
            reset_model_repo
            cp -r ${BACKEND_ROOT}/all_models/inflight_batcher_llm/* ${MODEL_DIR}
            # Copy the engine and place it into the model folder
            cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1
            ENGINE_DIR=${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/
            TRITON_MAX_BATCH_SIZE=64
            INSTANCE_COUNT=1
            MAX_QUEUE_DELAY_MS=0
            MAX_QUEUE_SIZE=0
            FILL_TEMPLATE_SCRIPT=${BACKEND_ROOT}/tools/fill_template.py
            if [ "${DECOUPLED_TRIAL}" == "non-decoupled" ]; then
                DECOUPLED_MODE=false
            else
                DECOUPLED_MODE=true
            fi

            python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_DIR}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
            python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_DIR}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT}
            python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_DIR}/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${ENGINE_DIR},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MS},batching_strategy:inflight_batching,max_queue_size:${MAX_QUEUE_SIZE},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,request_stats_max_iterations:10,exclude_input_in_output:True,enable_kv_cache_reuse:True,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,prompt_embedding_table_data_type:TYPE_FP16
            python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_DIR}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT}
            python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_DIR}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_count:${INSTANCE_COUNT},logits_datatype:TYPE_FP32,prompt_embedding_table_data_type:TYPE_FP16
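
            # fill_template.py rewrites each ${key} placeholder in the file
            # in-place (-i) with the value from the comma-separated key:value
            # list; it does the same job as the replace_config_tags/sed calls
            # used earlier in this script.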

            for ENDPOINT in generate grpc inflight_batcher_llm; do
                SERVER_LOG="./${NUM_GPU}gpu_perf_metrics_${TRITON_BACKEND}_${ENDPOINT}_${DECOUPLED_TRIAL}_server.log"
                CLIENT_LOG="./${NUM_GPU}gpu_perf_metrics_${TRITON_BACKEND}_${ENDPOINT}_${DECOUPLED_TRIAL}_client.log"
                run_server "${SERVER_ARGS}"
                wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}

                for ITER in 1 2; do
                    if [ "$ITER" == "1" ]; then
                        EXPECTED_KV_CACHE_ALLOC_NEW_BLOCKS=4
                        EXPECTED_KV_CACHE_ALLOC_TOTAL_BLOCKS=4
                        EXPECTED_KV_CACHE_REUSED_BLOCKS=0
                    else
                        EXPECTED_KV_CACHE_ALLOC_NEW_BLOCKS=1
                        EXPECTED_KV_CACHE_ALLOC_TOTAL_BLOCKS=1
                        EXPECTED_KV_CACHE_REUSED_BLOCKS=4
                    fi
                    EXPECTED_ACCEPTANCE_RATE=0.0
                    EXPECTED_TOTAL_ACCEPTED_DRAFT_TOKENS=0
                    EXPECTED_TOTAL_DRAFT_TOKENS=0
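
                    # Draft-token metrics are expected to stay zero because no
                    # speculative decoding (draft model) is configured in this
                    # test; only KV-cache reuse changes between iterations.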
if [ "$WAIT_RET" != "0" ]; then
|
|
# Cleanup
|
|
kill $SERVER_PID > /dev/null 2>&1 || true
|
|
echo -e "\n***\n*** Failed to start $SERVER\n***"
|
|
cat $SERVER_LOG
|
|
exit 1
|
|
fi
|
|
|
|
if [ "$ENDPOINT" == "generate" ]; then
|
|
# Generate endpoint
|
|
# Test with both ensemble and tensorrt_llm_bls models
|
|
if [ "$ITER" == "1" ]; then
|
|
MODEL="ensemble"
|
|
else
|
|
MODEL="tensorrt_llm_bls"
|
|
fi
|
|
|
|
set +e
|
|
if [ "${DECOUPLED_TRIAL}" == "non-decoupled" ]; then
|
|
code=`curl -s -w %{http_code} -o ./curl.out -d @tmp.txt localhost:8000/v2/models/${MODEL}/generate`
|
|
else
|
|
# Remove the "data:" prefix from the response to avoid parsing issues
|
|
code=$(curl -s -w %{http_code} -o ./curl.out -d @tmp.txt localhost:8000/v2/models/${MODEL}/generate_stream && sed -i 's/^data: //' ./curl.out)
|
|
fi
|
|
|
|
if [ "$code" != "200" ]; then
|
|
cat ./curl.out
|
|
echo -e "\n***\n*** Test Failed\n***"
|
|
RET=1
|
|
fi
|
|
set -e
|
|
|
|
kv_cache_alloc_new_blocks=$(jq '.kv_cache_alloc_new_blocks' curl.out)
|
|
kv_cache_alloc_total_blocks=$(jq '.kv_cache_alloc_total_blocks' curl.out)
|
|
kv_cache_reused_blocks=$(jq '.kv_cache_reused_blocks' curl.out)
|
|
arrival_time_ns=$(jq '.arrival_time_ns' curl.out)
|
|
first_scheduled_time_ns=$(jq '.first_scheduled_time_ns' curl.out)
|
|
first_token_time_ns=$(jq '.first_token_time_ns' curl.out)
|
|
last_token_time_ns=$(jq '.last_token_time_ns' curl.out)
|
|
acceptance_rate=$(jq '.acceptance_rate' curl.out)
|
|
total_accepted_draft_tokens=$(jq '.total_accepted_draft_tokens' curl.out)
|
|
total_draft_tokens=$(jq '.total_draft_tokens' curl.out)
|
|
                    else
                        STREAMING_FLAG=""
                        if [ "${DECOUPLED_TRIAL}" == "decoupled" ]; then
                            STREAMING_FLAG="--streaming"
                        fi
                        if [ "$ENDPOINT" == "grpc" ]; then
                            set +e
                            python3 ${STREAM_DIR}/end_to_end_grpc_client.py -v --prompt="$(cat prompt.txt)" --return-perf-metrics ${STREAMING_FLAG} > ${CLIENT_LOG} 2>&1
                            if [ $? -ne 0 ]; then
                                cat $SERVER_LOG
                                echo -e "\n***\n*** Error executing end_to_end_grpc_client.py with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
                                kill_server
                                wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
                                RET=1
                            fi
                            set -e
                        elif [ "$ENDPOINT" == "inflight_batcher_llm" ]; then
                            set +e
                            python3 ${STREAM_DIR}/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${TOKENIZER_DIR} \
                                --return-perf-metrics --text "$(cat prompt.txt)" ${STREAMING_FLAG} > ${CLIENT_LOG} 2>&1
                            if [ $? -ne 0 ]; then
                                cat $SERVER_LOG
                                echo -e "\n***\n*** Error executing inflight_batcher_llm_client.py with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
                                kill_server
                                wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
                                RET=1
                            fi
                            set -e
                        fi
                        kv_cache_alloc_new_blocks=$(grep "kv_cache_alloc_new_blocks" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        kv_cache_alloc_total_blocks=$(grep "kv_cache_alloc_total_blocks" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        kv_cache_reused_blocks=$(grep "kv_cache_reused_blocks" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        arrival_time_ns=$(grep "arrival_time_ns" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        first_scheduled_time_ns=$(grep "first_scheduled_time_ns" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        first_token_time_ns=$(grep "first_token_time_ns" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        last_token_time_ns=$(grep "last_token_time_ns" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        acceptance_rate=$(grep "acceptance_rate" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        total_accepted_draft_tokens=$(grep "total_accepted_draft_tokens" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                        total_draft_tokens=$(grep "total_draft_tokens" ${CLIENT_LOG} | head -n 1 | awk '{print $2}')
                    fi
if [[ "$kv_cache_alloc_new_blocks" -ne "$EXPECTED_KV_CACHE_ALLOC_NEW_BLOCKS" || \
|
|
"$kv_cache_alloc_total_blocks" -ne "$EXPECTED_KV_CACHE_ALLOC_TOTAL_BLOCKS" || \
|
|
"$kv_cache_reused_blocks" -ne "$EXPECTED_KV_CACHE_REUSED_BLOCKS" || \
|
|
"$acceptance_rate" != "$EXPECTED_ACCEPTANCE_RATE" || \
|
|
"$total_accepted_draft_tokens" -ne "$EXPECTED_TOTAL_ACCEPTED_DRAFT_TOKENS" || \
|
|
"$total_draft_tokens" -ne "$EXPECTED_TOTAL_DRAFT_TOKENS" ]]; then
|
|
echo "Test failed for ${ENDPOINT} with ${NUM_GPU}GPU(s):"
|
|
[[ "$kv_cache_alloc_new_blocks" -ne "$EXPECTED_KV_CACHE_ALLOC_NEW_BLOCKS" ]] && \
|
|
echo " kv_cache_alloc_new_blocks: expected $EXPECTED_KV_CACHE_ALLOC_NEW_BLOCKS, got $kv_cache_alloc_new_blocks"
|
|
[[ "$kv_cache_alloc_total_blocks" -ne "$EXPECTED_KV_CACHE_ALLOC_TOTAL_BLOCKS" ]] && \
|
|
echo " kv_cache_alloc_total_blocks: expected $EXPECTED_KV_CACHE_ALLOC_TOTAL_BLOCKS, got $kv_cache_alloc_total_blocks"
|
|
[[ "$kv_cache_reused_blocks" -ne "$EXPECTED_KV_CACHE_REUSED_BLOCKS" ]] && \
|
|
echo " kv_cache_reused_blocks: expected $EXPECTED_KV_CACHE_REUSED_BLOCKS, got $kv_cache_reused_blocks"
|
|
[[ "$acceptance_rate" != "$EXPECTED_ACCEPTANCE_RATE" ]] && \
|
|
echo " acceptance_rate: expected $EXPECTED_ACCEPTANCE_RATE, got $acceptance_rate"
|
|
[[ "$total_accepted_draft_tokens" -ne "$EXPECTED_TOTAL_ACCEPTED_DRAFT_TOKENS" ]] && \
|
|
echo " total_accepted_draft_tokens: expected $EXPECTED_TOTAL_ACCEPTED_DRAFT_TOKENS, got $total_accepted_draft_tokens"
|
|
[[ "$total_draft_tokens" -ne "$EXPECTED_TOTAL_DRAFT_TOKENS" ]] && \
|
|
echo " total_draft_tokens: expected $EXPECTED_TOTAL_DRAFT_TOKENS, got $total_draft_tokens"
|
|
RET=1
|
|
fi
|
|
|
|

                    if ! [[ $arrival_time_ns =~ ^-?[0-9]+$ ]] || [ $arrival_time_ns -eq 0 ]; then
                        echo "Arrival time $arrival_time_ns is not valid, expected positive integer value"
                        RET=1
                    fi
                    if ! [[ $first_scheduled_time_ns =~ ^-?[0-9]+$ ]] || [ $first_scheduled_time_ns -eq 0 ]; then
                        echo "First scheduled time $first_scheduled_time_ns is not valid, expected positive integer value"
                        RET=1
                    fi
                    if ! [[ $first_token_time_ns =~ ^-?[0-9]+$ ]] || [ $first_token_time_ns -eq 0 ]; then
                        echo "First token time $first_token_time_ns is not valid, expected positive integer value"
                        RET=1
                    fi
                    if [ "${DECOUPLED_TRIAL}" == "decoupled" ]; then
                        # Allow 0 for streaming mode
                        if ! [[ $last_token_time_ns =~ ^-?[0-9]+$ ]]; then
                            echo "Last token time $last_token_time_ns is not valid, expected integer value"
                            RET=1
                        fi
                    else
                        if ! [[ $last_token_time_ns =~ ^-?[0-9]+$ ]] || [ $last_token_time_ns -eq 0 ]; then
                            echo "Last token time $last_token_time_ns is not valid, expected positive integer value"
                            RET=1
                        fi
                    fi
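
                    # Only presence (and nonzero-ness where required) is
                    # asserted for the timestamps; the ordering
                    # arrival <= first_scheduled <= first_token <= last_token
                    # is assumed but not explicitly verified here.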
                done
                kill_server
                wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
                # Add a delay to make sure the memory is freed before starting the next test
                sleep 10
            done
        done
    done
done

set +e
# Verify TRT LLM statistics are being properly reported as custom metrics
python3 ${CUSTOM_METRICS_VERIFICATION_TEST} >> ${CUSTOM_METRICS_VERIFICATION_LOG} 2>&1
if [ $? -ne 0 ]; then
    cat ${CUSTOM_METRICS_VERIFICATION_LOG}
    RET=1
fi
set -e

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET