#!/usr/bin/bash
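# TensorRT-LLM Triton backend integration test driver
# (tests/integration/defs/triton_server/test.sh).
#
# Positional arguments (engine paths may be empty or "skip" when a test case
# does not use them):
#   $1 MODEL                  - test case name (gpt, gpt-ib, mistral-ib, whisper, mllama, ...)
#   $2 DECODER_ENGINE_PATH    - TensorRT-LLM decoder engine directory
#   $3 TOKENIZER_PATH         - tokenizer / HF model directory
#   $4 TOKENIZER_TYPE         - tokenizer type
#   $5 DRAFT_ENGINE_PATH      - draft engine for speculative decoding tests
#   $6 TARGET_ENGINE_PATH     - target engine for speculative decoding tests
#   $7 ENCODER_ENGINE_PATH    - encoder engine for enc-dec / whisper tests
#   $8 MULTIMODAL_ENGINE_PATH - multimodal (vision/audio) encoder engine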
MODEL=$1
DECODER_ENGINE_PATH=$2
TOKENIZER_PATH=$3
TOKENIZER_TYPE=$4
DRAFT_ENGINE_PATH=$5
TARGET_ENGINE_PATH=$6
ENCODER_ENGINE_PATH=$7
MULTIMODAL_ENGINE_PATH=$8
set -ex
set -o pipefail
nvidia-smi
pushd $LLM_BACKEND_ROOT
source tools/utils.sh
TRITON_REPO="triton_repo"
kill_triton_server () {
pkill -9 -f trtllmExecutorWorker || true
pkill -9 -f tritonserver
}
# Kill tritonserver if it is still running from a previous test
kill_triton_server || true
if [ "$MODEL" = "mistral" ] || [ "$MODEL" = "mistral-ib" ] || [ "$MODEL" = "mistral-ib-mm" ]; then
MAX_ATTENTION_WINDOW_SIZE="2048"
MAX_SEQUENCE_LEN="8704" # max_input_len + max_output_len
elif [ "$MODEL" = "t5-ib" ] || [ "$MODEL" = "bart-ib" ]; then
MAX_ATTENTION_WINDOW_SIZE=""
MAX_SEQUENCE_LEN="4096" # for enc-dec, choose a sufficient size of max token in kv cache to avoid no free block error
elif [ "$MODEL" = "whisper" ]; then
MAX_ATTENTION_WINDOW_SIZE=""
MAX_SEQUENCE_LEN="24000" # WAR to avoid no free block errors
else
MAX_ATTENTION_WINDOW_SIZE=""
MAX_SEQUENCE_LEN="2048"
fi
if [ "$MODEL" = "mllama" ]; then
ENCODER_INPUT_FEATURES_DTYPE="TYPE_BF16"
else
ENCODER_INPUT_FEATURES_DTYPE="TYPE_FP16"
fi
if [ "$MODEL" = "gpt" ] || [ "$MODEL" = "opt" ] || [ "$MODEL" = "llama" ] || [ "$MODEL" = "gptj" ] || [ "$MODEL" = "mistral" ]; then
rm -rf ${TRITON_REPO}
cp -R all_models/gpt ${TRITON_REPO}
# Modify config.pbtxt
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt engine_dir:${DECODER_ENGINE_PATH}
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH}
python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH}
# Launch Triton Server
mpirun --allow-run-as-root \
-n 1 /opt/tritonserver/bin/tritonserver \
--model-repository=${TRITON_REPO} \
--disable-auto-complete-config \
--backend-config=python,shm-region-prefix-name=prefix0_ : &
export SERVER_PID=$!
wait_for_server_ready ${SERVER_PID} 1200 ${TRITON_HTTP_PORT}
pushd tools/gpt/
# Client
python3 client.py \
--text="Born in north-east France, Soyer trained as a" \
--output_len=10 \
--protocol=http \
--tokenizer_dir ${TOKENIZER_PATH}
python3 client.py \
--text="Born in north-east France, Soyer trained as a" \
--output_len=10 \
--protocol=grpc \
--tokenizer_dir ${TOKENIZER_PATH}
# Async Client
python3 client_async.py \
--text="Born in north-east France, Soyer trained as a" \
--output_len=10 \
--protocol=http \
--tokenizer_dir ${TOKENIZER_PATH}
python3 client_async.py \
--text="Born in north-east France, Soyer trained as a" \
--output_len=10 \
--protocol=grpc \
--tokenizer_dir ${TOKENIZER_PATH}
# End to end test
python3 end_to_end_test.py \
--tokenizer_dir ${TOKENIZER_PATH}
# Benchmark Core Model
python3 benchmark_core_model.py \
--batch_size=8 --start_len=128 --output_len=20 \
--protocol=http --mode=sync
python3 benchmark_core_model.py \
--batch_size=8 --start_len=128 --output_len=20 \
--protocol=grpc --mode=sync
python3 benchmark_core_model.py \
--batch_size=8 --start_len=128 --output_len=20 \
--protocol=http --mode=async
python3 benchmark_core_model.py \
--batch_size=8 --start_len=128 --output_len=20 \
--protocol=grpc --mode=async
# Benchmark using Perf Analyzer
python3 gen_input_data.py
# FIXME(kaiyu): Uncomment this when perf_analyzer is available.
# perf_analyzer -m tensorrt_llm -v \
# -b 8 --input-data input_data.json \
# --concurrency-range 2 \
# -i http \
# -u 'localhost:8000'
# perf_analyzer -m tensorrt_llm -v \
# -b 8 --input-data input_data.json \
# --concurrency-range 2 \
# -i grpc \
# -u 'localhost:8001'
kill ${SERVER_PID}
popd # tools/gpt
fi
print_test_params () {
echo "----------------------------------"
echo " Test parameters:"
echo "----------------------------------"
echo "BACKEND: ${BACKEND}"
echo "BATCHING_STRATEGY: ${BATCHING_STRATEGY}"
echo "MAX_TOKENS_IN_KV_CACHE: ${MAX_TOKENS_IN_KV_CACHE}"
echo "MAX_ATTENTION_WINDOW_SIZE: ${MAX_ATTENTION_WINDOW_SIZE}"
echo "BATCH_SCHEDULER_POLICY: ${BATCH_SCHEDULER_POLICY}"
echo "KV_CACHE_FREE_GPU_MEM_FRACTION: ${KV_CACHE_FREE_GPU_MEM_FRACTION}"
echo "CROSS_KV_CACHE_FRACTION: ${CROSS_KV_CACHE_FRACTION}"
echo "EXCLUDE_INPUT_IN_OUTPUT: ${EXCLUDE_INPUT_IN_OUTPUT}"
echo "TRITON_MAX_BATCH_SIZE: ${TRITON_MAX_BATCH_SIZE}"
echo "MAX_QUEUE_DELAY_MICROSECONDS: ${MAX_QUEUE_DELAY_MICROSECONDS}"
echo "MAX_BEAM_WIDTH: ${MAX_BEAM_WIDTH}"
echo "ENABLE_KV_CACHE_REUSE: ${ENABLE_KV_CACHE_REUSE}"
echo "E2E_MODEL_NAME: ${E2E_MODEL_NAME}"
echo "TENSORRT_LLM_MODEL_NAME: ${TENSORRT_LLM_MODEL_NAME}"
echo "TENSORRT_LLM_TARGET_MODEL_NAME: ${TENSORRT_LLM_TARGET_MODEL_NAME}"
echo "TENSORRT_LLM_DRAFT_MODEL_NAME: ${TENSORRT_LLM_DRAFT_MODEL_NAME}"
echo "ACCUMULATE_TOKEN: ${ACCUMULATE_TOKEN}"
echo "BLS_INSTANCE_COUNT: ${BLS_INSTANCE_COUNT}"
echo "PREPROCESSING_INSTANCE_COUNT: ${PREPROCESSING_INSTANCE_COUNT}"
echo "POSTPROCESSING_INSTANCE_COUNT: ${POSTPROCESSING_INSTANCE_COUNT}"
echo "NORMALIZE_LOG_PROBS: ${NORMALIZE_LOG_PROBS}"
echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"
echo "GPU_DEVICE_IDS: ${GPU_DEVICE_IDS}"
echo "DECODING_MODE: ${DECODING_MODE}"
echo "MAX_QUEUE_SIZE: ${MAX_QUEUE_SIZE}"
echo "ENABLE_CONTEXT_FMHA_FP32_ACC: ${ENABLE_CONTEXT_FMHA_FP32_ACC}"
echo "PROMPT_EMBEDDING_TABLE_DTYPE: ${PROMPT_EMBEDDING_TABLE_DTYPE}"
echo "run_all_tests: ${run_all_tests}"
echo "----------------------------------"
}
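# Populate the Triton model repository: fill the config.pbtxt templates for the
# tensorrt_llm, preprocessing, postprocessing, ensemble and tensorrt_llm_bls
# models, and, depending on the test case, add draft/target copies for
# speculative decoding, the multimodal encoder models, the whisper_bls model,
# or the disaggregated-serving context/generation models.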
fill_triton_repo () {
if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ]; then
cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft
sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt
fi
if [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_target
sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_target"/g' ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt
fi
echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm with engine ${DECODER_ENGINE_PATH}"
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32,lookahead_window_size:${LOOKAHEAD_WINDOW_SIZE},lookahead_ngram_size:${LOOKAHEAD_NGRAM_SIZE},lookahead_verification_set_size:${LOOKAHEAD_VERIFICATION_SET_SIZE}
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ] && [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_TARGET_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE}
else
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:"",prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE}
fi
if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ]; then
echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_draft with engine ${DRAFT_ENGINE_PATH}"
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32
fi
if [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_target with engine ${TARGET_ENGINE_PATH}"
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt triton_backend:${BACKEND},engine_dir:${TARGET_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:true,normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32
fi
# encoder-decoder model only
if [ "${CROSS_KV_CACHE_FRACTION}" != "" ]; then
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION}
fi
if [ "${ENCODER_ENGINE_PATH}" != "" ] && [ "${ENCODER_ENGINE_PATH}" != "skip" ]; then
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt encoder_engine_dir:${ENCODER_ENGINE_PATH}
fi
if [ "${MULTIMODAL_ENGINE_PATH}" != "" ] && [ "${MULTIMODAL_ENGINE_PATH}" != "skip" ]; then
cp all_models/multimodal/ensemble ${TRITON_REPO} -r
cp all_models/multimodal/multimodal_encoders ${TRITON_REPO} -r
python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${DECODER_ENGINE_PATH}
python3 tools/fill_template.py -i ${TRITON_REPO}/multimodal_encoders/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},hf_model_path:${TOKENIZER_PATH}
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt multimodal_encoders_name:multimodal_encoders
fi
if [ "$MODEL" = "whisper" ]; then
cp all_models/whisper/whisper_bls ${TRITON_REPO} -r
rm -r ${TRITON_REPO}/preprocessing ${TRITON_REPO}/postprocessing ${TRITON_REPO}/ensemble ${TRITON_REPO}/tensorrt_llm_bls
python3 tools/fill_template.py -i ${TRITON_REPO}/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:128,zero_pad:false,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE}
wget -nc --directory-prefix=${TRITON_REPO}/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
wget -nc --directory-prefix=${TRITON_REPO}/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
fi
if [ "$MODEL" = "gpt-disaggregated-serving-bls" ]; then
cp -r ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/generation
mv ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/context
cp -r all_models/disaggregated_serving/disaggregated_serving_bls/ ${TRITON_REPO}
python3 tools/fill_template.py -i ${TRITON_REPO}/disaggregated_serving_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},disaggregated_serving_bls_count:${BLS_INSTANCE_COUNT},context_model_name:context,generation_model_name:generation,logits_datatype:TYPE_FP32
mv ${TRITON_REPO}/disaggregated_serving_bls ${TRITON_REPO}/tensorrt_llm
sed 's/name: "disaggregated_serving_bls"/name: "tensorrt_llm"/' -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt
sed 's/name: "tensorrt_llm"/name: "context"/' -i ${TRITON_REPO}/context/config.pbtxt
sed 's/name: "tensorrt_llm"/name: "generation"/' -i ${TRITON_REPO}/generation/config.pbtxt
fi
}
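# Recreate the model repository from the inflight_batcher_llm templates, fill
# it, and start a single tritonserver instance in the background. SERVER_PID is
# exported and passed to wait_for_server_ready.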
launch_triton_server () {
print_test_params
rm -rf ${TRITON_REPO}
cp -R all_models/inflight_batcher_llm ${TRITON_REPO}
fill_triton_repo
# Launch Triton Server
/opt/tritonserver/bin/tritonserver \
--disable-auto-complete-config --model-repository=${TRITON_REPO} --http-port ${TRITON_HTTP_PORT} --grpc-port ${TRITON_GRPC_PORT} --metrics-port ${TRITON_METRICS_PORT} > log.txt 2>&1 &
export SERVER_PID=$!
wait_for_server_ready ${SERVER_PID} 1200 ${TRITON_HTTP_PORT}
}
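# Exercise stop-word handling through end_to_end_grpc_client.py: stop words
# that do not match the generated tokens must leave the output unchanged, while
# a matching stop word must truncate generation at that word. For models other
# than gpt-ib the request goes through the tensorrt_llm_bls model, which
# supports word-level stop detection.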
test_stop_words() {
# This test runs for all combinations of flags
EXCL_INPUT_IN_OUTPUT_FLAG=""
[ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"
STREAMING_FLAG=""
[ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"
BEAM_FLAG="--beam-width 1"
[ "${DECODING_MODE}" = "beam_search" ] && BEAM_FLAG="--beam-width 2"
# Test client
pushd inflight_batcher_llm/client
if [[ $MODEL = "gpt-ib" ]]; then
PROMPT="The only thing we have to fear is"
OUTLEN=10
ORIGINAL_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" 2>&1 | tail -n 1)
echo "original output"
echo $ORIGINAL_OUTPUT
# should be something like "[...] that the government will [...]"
# examples of stop words that should not affect generation:
# "government" is not tokenized the same way as " government"
# " that the public" does not fully match the generated string
TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words "government" " that the public" 2>&1 | tail -n 1)
[[ "${ORIGINAL_OUTPUT}" == "${TEST_OUTPUT}" ]]
# check that output finishes at "government"
TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words " lorem" " government" 2>&1 | tail -n 1)
[[ "${TEST_OUTPUT}" == *"government" ]]
TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words " that the government" 2>&1 | tail -n 1)
[[ "${TEST_OUTPUT}" == *"government" ]]
else
PROMPT="What does Jonathan mean?"
OUTLEN=10
# Only the BLS model (tensorrt_llm_bls) supports stop-word detection at the word level;
# the word "Jonathan" has multiple tokenizations, which TRT-LLM does not detect.
TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words "Jonathan" --model-name "tensorrt_llm_bls" $BEAM_FLAG 2>&1 | tail -n 1)
fi
popd
}
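# Core tensorrt_llm model tests via inflight_batcher_llm_client.py: long-input
# handling when an attention window is set, output accuracy for real-weight
# models, stop/cancel requests, log probs, string request ids, multiple return
# sequences, Triton metrics checks, and benchmark_core_model.py runs.
# Optional args $1/$2/$3 are an input-tokens CSV, the expected output-tokens
# CSV and the end id for an additional --check-output run.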
run_cpp_trtllm_backend_tests () {
# These tests run for all combinations of flags
EXCL_INPUT_IN_OUTPUT_FLAG=""
[ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"
STREAMING_FLAG=""
[ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"
# Test client
pushd inflight_batcher_llm/client
if [ -n "${MAX_ATTENTION_WINDOW_SIZE}" ]; then
# test using a longer input
# TODO: Once we switch to using real weights, add `--check-output` arg
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
--tokenizer-dir ${TOKENIZER_PATH} \
--input-tokens-csv='../../tools/dataset/long_input.csv' \
--output-tokens-csv='../../tools/dataset/long_output.csv' \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
2>&1 | tee output_long_input
# If the prompt is excluded from the output, only check that the output
# sequence is not an empty list of tokens; otherwise the output should start
# with the prompt tokens.
if [ -n "${EXCL_INPUT_IN_OUTPUT_FLAG}" ]; then
grep -o "Output sequence\( starts with\)\?:\s*\[\([0-9]*\,\?\s\?\)*\]" output_long_input
else
grep -o "Output sequence starts with: \[1, 3189, 28809, 28707, 7234, 574, 3441, 1236, 28723, 28705" output_long_input
fi
fi
# testing output accuracy for real weights only
CHECK_OUTPUT_FLAG=""
if [ $MODEL = "gpt-ib" ]; then
CHECK_OUTPUT_FLAG="--check-output"
fi
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
${CHECK_OUTPUT_FLAG} \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
--tokenizer-dir ${TOKENIZER_PATH}
# Check that metrics work as expected by looking at the number of successful requests for the tensorrt_llm model
num_success=$(curl localhost:${TRITON_METRICS_PORT}/metrics 2>&1 | grep nv_inference_request_success\{model=\"tensorrt_llm\" | cut -d " " -f 2)
if (( num_success <= 0 )); then
exit 1
else
echo "Number of successful requests: $num_success"
fi
if [[ "$run_all_tests" == "true" && "$BATCHING_STRATEGY" == "inflight_fused_batching" ]]; then
# testing output accuracy for real weights only
if [[ $MODEL = "gpt-ib" ]] || [[ $MODEL = "mistral-ib-streaming" ]]; then
popd
test_stop_words
pushd inflight_batcher_llm/client
fi
# Stop request
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
--request-output-len=128 \
--stop-after-ms 100 \
--tokenizer-dir ${TOKENIZER_PATH} \
--request-id 1 \
2>&1 | tee output_w_stop
grep "Got cancellation response" output_w_stop
if [[ "${STREAMING}" == "true" ]]; then
# Request cancellation
python3 inflight_batcher_llm_client.py \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
--streaming \
--request-output-len=128 \
--stop-after-ms 100 \
--request-id 1 \
--stop-via-request-cancel \
--tokenizer-dir ${TOKENIZER_PATH} 2>&1 | tee output_w_stop
grep "Request is cancelled" output_w_stop
fi
if [[ -n "${1}" && -n "${2}" && -n "${3}" ]]; then
python3 inflight_batcher_llm_client.py \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
${STREAMING_FLAG} \
--request-output-len=128 \
--end-id $3 \
--request-id 1 \
--tokenizer-dir ${TOKENIZER_PATH} \
--input-tokens-csv=$1 \
--output-tokens-csv=$2 \
--check-output
fi
# Test with returned log probs
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
--request-output-len=10 \
--tokenizer-dir ${TOKENIZER_PATH} \
--return-log-probs --top-k 2 \
2>&1 | tee output_log_probs
# Test with a string request id
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
${CHECK_OUTPUT_FLAG} \
--tokenizer-dir ${TOKENIZER_PATH} \
--request-id my_request 2>&1 | tee output_str_request
# Returning multiple sequences requires decoupled mode.
if [[ "${DECOUPLED_MODE}" == "True" ]]; then
# Test with multiple returned sequences
python3 inflight_batcher_llm_client.py \
${STREAMING_FLAG} \
${CHECK_OUTPUT_FLAG} \
--tokenizer-dir ${TOKENIZER_PATH} \
--num-return-sequences 2 2>&1 | tee output_n_return
fi
# Test triton metrics are present and have non-zero values (when applicable).
TRITON_METRICS_LOG="triton_metrics.out"
curl localhost:${TRITON_METRICS_PORT}/metrics -o ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="context",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="scheduled",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="max",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="active",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="waiting",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="pinned",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="gpu",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="cpu",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="used",model="tensorrt_llm",version="1"\} [0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="free",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="max",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="fraction",model="tensorrt_llm",version="1"\} [0-9]*\.?[0-9]+$' ${TRITON_METRICS_LOG}
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="num_ctx_tokens",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="num_gen_tokens",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="empty_gen_slots",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
else
grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="paused_requests",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="micro_batch_id",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="generation_requests",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="total_context_tokens",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
fi
grep -E 'nv_trt_llm_general_metrics\{general_type="iteration_counter",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
grep -E 'nv_trt_llm_general_metrics\{general_type="timestamp",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
rm ${TRITON_METRICS_LOG}
fi
popd # inflight_batcher_llm/client
# End to end test
pushd tools/inflight_batcher_llm
# HTTP client cannot be used with decoupled mode.
if [[ "${DECOUPLED_MODE}" == "False" ]]; then
python3 benchmark_core_model.py \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
--concurrency 8 \
-i http \
--max-input-len 300 \
dataset \
--dataset ../dataset/mini_cnn_eval.json \
--tokenizer-dir ${TOKENIZER_PATH}
fi
if [[ "$run_all_tests" == "true" ]]; then
# Note: streaming flag is not set to 1 for these benchmarks regardless
# of the value of $STREAMING.
DECOUPLED_FLAG=""
[ "${DECOUPLED_MODE}" = "True" ] && DECOUPLED_FLAG="--decoupled"
python3 benchmark_core_model.py \
${DECOUPLED_FLAG} \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
--concurrency 8 \
-i grpc \
--max-input-len 300 \
--num-requests 80 \
dataset \
--dataset ../dataset/mini_cnn_eval.json \
--tokenizer-dir ${TOKENIZER_PATH}
# Performance check.
python3 benchmark_core_model.py \
${DECOUPLED_FLAG} \
${CHECK_PERF_JSON_ARGS} \
--check-perf-key ${MODEL}-${BACKEND} \
--check-perf-rtol 0.05 \
--check-perf-atol 50 \
--concurrency 8 \
-i grpc \
--max-input-len 300 \
--request-rate -1 \
--num-requests 1000 \
token-norm-dist \
--input-mean 128 --input-stdev 0 \
--output-mean 20 --output-stdev 0
python3 benchmark_core_model.py \
${DECOUPLED_FLAG} \
-i grpc --max-input-len 1000 \
--request-rate -1 \
token-from-histogram --histogram-key example
fi
popd # tools/inflight_batcher_llm
}
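# End-to-end tests through the ensemble / BLS model: for the GPT test cases,
# check the expected continuation of the reference prompt, the embedding-bias
# behaviour and batched requests (decoupled mode only); then run the
# end_to_end_test.py dataset tests over HTTP (and GRPC when run_all_tests is
# set) in non-streaming mode.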
run_cpp_e2e_backend_tests () {
STREAMING_FLAG=""
[ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"
OVERWRITE_OUTPUT_TEXT_FLAG=""
[ "${ACCUMULATE_TOKEN}" = "true" ] && OVERWRITE_OUTPUT_TEXT_FLAG="--overwrite-output-text"
EXCL_INPUT_IN_OUTPUT_FLAG=""
[ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"
pushd inflight_batcher_llm/client
# testing output accuracy for real weights only
if [[ $MODEL = "gpt-ib" || $MODEL = "gpt-ib-streaming" ]]; then
python3 end_to_end_grpc_client.py \
${STREAMING_FLAG} \
--output-len 10 --prompt "The only thing we have to fear is" \
${OVERWRITE_OUTPUT_TEXT_FLAG} \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
--model-name "$E2E_MODEL_NAME" | tee output_e2e
grep "that the government will" output_e2e
if [[ "$EXCL_INPUT_IN_OUTPUT_FLAG" != "" ]]; then
grep -v "The only thing we have to fear is" output_e2e
fi
if [[ "$run_all_tests" == "true" && "$BATCHING_STRATEGY" == "inflight_fused_batching" ]]; then
# test with embedding bias
python3 end_to_end_grpc_client.py \
${STREAMING_FLAG} \
${OVERWRITE_OUTPUT_TEXT_FLAG} \
-o 10 \
-p "The only thing we have to fear is" \
--embedding-bias-words " government" \
--embedding-bias-weights -20 \
--model-name "$E2E_MODEL_NAME" \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
2>&1 | tee output_w_bias
grep -v "that the government will" output_w_bias
if [[ "$EXCL_INPUT_IN_OUTPUT_FLAG" != "" ]]; then
grep -v "The only thing we have to fear is" output_e2e
fi
# Only run the batched-request test when streaming for now, since it requires decoupled mode
if [[ "$DECOUPLED_MODE" == "True" ]]; then
# test with batched requests
python3 end_to_end_grpc_client.py \
${STREAMING_FLAG} \
${OVERWRITE_OUTPUT_TEXT_FLAG} \
${EXCL_INPUT_IN_OUTPUT_FLAG} \
-o 5 \
--model-name "$E2E_MODEL_NAME" \
-p '["This is a test","I want you to","The cat is"]' \
--batch-inputs --check-outputs --expected-outputs '[" of the power of the"," know that I am not"," a very good cat."]'
fi
fi
fi
popd # inflight_batcher_llm/client
# End to end test
pushd tools/inflight_batcher_llm
# end_to_end_test.py doesn't support streaming
if [[ "${STREAMING}" == "false" ]]; then
python3 end_to_end_test.py \
--concurrency 8 \
-i http \
--max-input-len 200 \
--test-bls \
--dataset ../dataset/mini_cnn_eval.json
if [[ "$run_all_tests" == "true" ]]; then
python3 end_to_end_test.py \
--concurrency 8 \
-i grpc \
--max-input-len 200 \
--test-bls \
--dataset ../dataset/mini_cnn_eval.json
fi
fi
popd # tools/inflight_batcher_llm
}
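# Queue-size test: send 100 requests with 256 output tokens each through
# tools/inflight_batcher_llm/test_max_queue_size.py to exercise the
# max_queue_size limit (extra GRPC flags are used in decoupled mode).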
run_cpp_trtllm_queue_size_tests () {
# Test client
echo "25229,291,7379,251522,39854,5754,251514,315,32906,14297,398,261" > input.csv
pushd tools/inflight_batcher_llm
EXTRA_FLAGS=""
if [[ "${DECOUPLED_MODE}" == "True" ]]; then
EXTRA_FLAGS="-p grpc -u localhost:8001"
fi
python3 test_max_queue_size.py --input-tokens-csv ../../input.csv --request-output-len 256 --num-requests 100 ${EXTRA_FLAGS}
popd # tools/inflight_batcher_llm
}
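# Default test parameters. The per-model branches below sweep over subsets of
# these values (backend, batching strategy, KV-cache limits, scheduler policy,
# chunked context, ...) and restore the defaults when they are done.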
BACKENDS=( "tensorrtllm" "python" )
BATCHING_STRATEGIES=( "inflight_fused_batching" )
MAX_TOKENS_IN_KV_CACHES=( "" $MAX_SEQUENCE_LEN )
BATCH_SCHEDULER_POLICIES=( "guaranteed_no_evict" "max_utilization" )
KV_CACHE_FREE_GPU_MEM_FRACTIONS=( "0.2" "" )
CROSS_KV_CACHE_FRACTION=""
ENABLE_CHUNKED_CONTEXTS=( "false" "true" )
BACKEND="tensorrtllm"
TRITON_MAX_BATCH_SIZE="128"
MAX_QUEUE_DELAY_MICROSECONDS="0"
MAX_BEAM_WIDTH="1"
ENABLE_KV_CACHE_REUSE="false"
E2E_MODEL_NAME="ensemble"
TENSORRT_LLM_MODEL_NAME="tensorrt_llm"
TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft"
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm_target"
ACCUMULATE_TOKEN="false"
EXCLUDE_INPUT_IN_OUTPUT="false"
BLS_INSTANCE_COUNT="1"
PREPROCESSING_INSTANCE_COUNT="1"
POSTPROCESSING_INSTANCE_COUNT="1"
NORMALIZE_LOG_PROBS="true"
TRITON_HTTP_PORT="8000"
TRITON_GRPC_PORT="8001"
TRITON_METRICS_PORT="8002"
GPU_DEVICE_IDS=""
DECODING_MODE="top_k_top_p"
MAX_QUEUE_SIZE="0"
PROMPT_EMBEDDING_TABLE_DTYPE="TYPE_FP16"
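# -------------------------------
# Per-model test branches: each block below sets up the parameters for one
# MODEL value, launches the server, runs the matching client tests and
# restores the defaults afterwards.
# -------------------------------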
if [ "$MODEL" = "gpt-ib" ] || [ "$MODEL" = "mistral-ib" ] || [ "$MODEL" = "mistral-ib-mm" ]; then
# Non-streaming tests, decoupled is false
DECOUPLED_MODE="False"
STREAMING="false"
# -------------------------------
# Param sweep test
# -------------------------------
run_all_tests="true"
for BACKEND in "${BACKENDS[@]}"; do
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do
for BATCH_SCHEDULER_POLICY in "${BATCH_SCHEDULER_POLICIES[@]}"; do
for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do
for ENABLE_CHUNKED_CONTEXT in "${ENABLE_CHUNKED_CONTEXTS[@]}"; do
# Because the runners are shared, the default value of 0.9 doesn't work, so skip
# if max_tokens_in_kv_cache is also empty
if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then
continue
fi
if [[ "${BATCHING_STRATEGY}" == "v1" && "${BATCH_SCHEDULER_POLICY}" == "max_utilization" ]]; then
continue
fi
# For V1, batchScheduler currently cannot properly estimate kvCache usage
if [[ "${BATCHING_STRATEGY}" == "v1" && "${MAX_TOKENS_IN_KV_CACHE}" != "" ]]; then
continue
fi
# mistral is built without chunked context support
if [[ "$MODEL" = "mistral-ib" && "${ENABLE_CHUNKED_CONTEXT}" == "true" ]]; then
continue
fi
if [[ "$MODEL" = "mistral-ib-mm" && "${ENABLE_CHUNKED_CONTEXT}" == "true" ]]; then
continue
fi
if [[ "$MODEL" = "mistral-ib-mm" ]]; then
export TRTLLM_ORCHESTRATOR=1
fi
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
run_all_tests="false"
done
done
done
done
done
done
BACKEND="${BACKENDS[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[0]}"
# -------------------------------
# Exclude input in output test
# -------------------------------
EXCLUDE_INPUT_IN_OUTPUT="true"
run_all_tests="false"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
done
EXCLUDE_INPUT_IN_OUTPUT="false"
# -------------------------------
# Max queue delay microseconds
# -------------------------------
run_all_tests="false"
MAX_QUEUE_DELAY_MICROSECONDS="1000000"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
done
MAX_QUEUE_DELAY_MICROSECONDS="0"
# -------------------------------
# Max queue size
# -------------------------------
run_all_tests="false"
MAX_QUEUE_SIZE="6"
TRITON_MAX_BATCH_SIZE="1"
BATCHING_STRATEGY="inflight_fused_batching"
for BACKEND in "${BACKENDS[@]}"; do
launch_triton_server
run_cpp_trtllm_queue_size_tests
kill_triton_server
done
MAX_QUEUE_SIZE="0"
TRITON_MAX_BATCH_SIZE="128"
BACKEND="${BACKENDS[0]}"
# -------------------------------
# Python BLS
# -------------------------------
ACCUMULATE_TOKENS=( "false" "true" )
E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
launch_triton_server
run_cpp_e2e_backend_tests
kill_triton_server
done
done
done
E2E_MODEL_NAME="ensemble"
ACCUMULATE_TOKEN="false"
fi
if [ "$MODEL" = "gpt-ib-streaming" ]; then
DECOUPLED_MODE="True"
STREAMING="true"
run_all_tests="true"
for BACKEND in "${BACKENDS[@]}"; do
run_all_tests="true"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do
for BATCH_SCHEDULER_POLICY in "${BATCH_SCHEDULER_POLICIES[@]}"; do
for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do
for ENABLE_CHUNKED_CONTEXT in "${ENABLE_CHUNKED_CONTEXTS[@]}"; do
# Because the runners are shared, the default value of 0.9 doesn't work, so skip
# if max_tokens_in_kv_cache is also empty
if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then
continue
fi
if [[ "${BATCHING_STRATEGY}" == "v1" && "${BATCH_SCHEDULER_POLICY}" == "max_utilization" ]]; then
continue
fi
# For V1, batchScheduler currently cannot properly estimate kvCache usage
if [[ "${BATCHING_STRATEGY}" == "v1" && "${MAX_TOKENS_IN_KV_CACHE}" != "" ]]; then
continue
fi
launch_triton_server
run_cpp_trtllm_backend_tests '../../tools/dataset/short_input_end_id.csv' '../../tools/dataset/short_output_end_id.csv' 268
run_cpp_e2e_backend_tests
kill_triton_server
run_all_tests="false"
done
done
done
done
done
done
BACKEND="${BACKENDS[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[0]}"
# --------------------
# Python BLS test
# --------------------
ACCUMULATE_TOKENS=( "false" "true" )
E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
run_all_tests="true"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
launch_triton_server
run_cpp_e2e_backend_tests
kill_triton_server
done
done
done
E2E_MODEL_NAME="ensemble"
ACCUMULATE_TOKEN="false"
run_all_tests="false"
fi
if [ "$MODEL" = "mistral-ib-streaming" ]; then
DECOUPLED_MODE="True"
STREAM=("true" "false")
EXCLUDE_INPUT_IN_OUTPUT_OPTS=("true" "false")
run_all_tests="true"
MAX_BEAM_WIDTH="2"
DECODING_MODES=("top_k_top_p" "beam_search")
# --------------------
# Python BLS test
# --------------------
ACCUMULATE_TOKENS=( "false" )
E2E_MODEL_NAMES=( "tensorrt_llm_bls" )
run_all_tests="true"
for BATCHING_STRATEGY in "inflight_fused_batching"; do
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
for STREAMING in "${STREAM[@]}"; do
for EXCLUDE_INPUT_IN_OUTPUT in "${EXCLUDE_INPUT_IN_OUTPUT_OPTS[@]}"; do
for DECODING_MODE in ${DECODING_MODES[@]}; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
launch_triton_server
test_stop_words
kill_triton_server
done
done
done
done
done
done
MAX_BEAM_WIDTH="1"
E2E_MODEL_NAME="ensemble"
ACCUMULATE_TOKEN="false"
run_all_tests="false"
DECODING_MODE="top_k_top_p"
fi
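# Speculative decoding through the tensorrt_llm_bls model: the draft and target
# engines are served by the same tritonserver instance, so the draft, target
# and control URLs all point to localhost:8001.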
if [ "$MODEL" = "gpt-ib-speculative-decoding-bls" ]; then
# --------------------
# Python BLS test
# --------------------
DECOUPLED_MODE="False"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
USE_DRAFT_LOGITS_VALUES=( "true" "false" )
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for USE_DRAFT_LOGITS in "${USE_DRAFT_LOGITS_VALUES[@]}"; do
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
draft_args="--num-draft-tokens=5"
if [[ "${USE_DRAFT_LOGITS}" == "true" ]]; then
# With draft logits the outputs are not deterministic, so output comparison is disabled
draft_args="--num-draft-tokens=5 --return-generation-logits --use-draft-logits --disable-output-comparison"
fi
ENABLE_KV_CACHE_REUSE="true"
launch_triton_server
# Test client
pushd tools/inflight_batcher_llm
python3 speculative_decoding_test.py \
--max-input-len 200 \
--dataset ../dataset/mini_cnn_eval_spec_decoding.json \
--url-target=localhost:8001 \
--url-draft=localhost:8001 \
--url-control=localhost:8001 \
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
--target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \
--bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \
--execute-bls-speculative-decoding \
${draft_args} \
--verbose
popd # inflight_batcher_llm/client
kill_triton_server
done
done
fi
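# Prompt-tuning (p-tuning) test: generate reference outputs with
# examples/run.py, with and without the prompt table, then verify the backend
# against them with inflight_batcher_llm_client.py.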
if [ "$MODEL" = "gpt-ib-ptuning" ]; then
# Generate reference output
pushd $LLM_ROOT/examples/models/core/gpt
# Input with virtual tokens:
python3 $LLM_ROOT/examples/run.py \
--max_output_len=8 \
--vocab_file=c-model/email_composition/fp16/tokenizer.model \
--prompt_table_path=email_composition.npy \
--input_file=input.csv \
--engine_dir ${DECODER_ENGINE_PATH} \
--output_csv output_w_prompt.csv \
--enable_context_fmha_fp32_acc \
--no-kv_cache_enable_block_reuse
# Input without virtual tokens:
echo "25229,291,7379,251522,39854,5754,251514,315,32906,14297,398,261" > input_wo_prompt.csv
python3 $LLM_ROOT/examples/run.py \
--max_output_len=8 \
--vocab_file=c-model/email_composition/fp16/tokenizer.model \
--input_file=input_wo_prompt.csv \
--engine_dir ${DECODER_ENGINE_PATH} \
--output_csv output_wo_prompt.csv \
--enable_context_fmha_fp32_acc \
--no-kv_cache_enable_block_reuse
popd
DECOUPLED_MODE="False"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_CONTEXT_FMHA_FP32_ACC="True"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
python3 inflight_batcher_llm_client.py \
--prompt-embedding-table $LLM_ROOT/examples/models/core/gpt/email_composition.npy \
--prompt-task-id 0 \
--input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \
--output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output_w_prompt.csv \
--check-output \
--request-output-len 8
python3 inflight_batcher_llm_client.py --input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input_wo_prompt.csv --output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output_wo_prompt.csv --check-output --request-output-len 8
popd # inflight_batcher_llm/client
kill_triton_server
done
fi
if [ "$MODEL" = "gpt-2b-ib-lora" ]; then
# Generate reference output
pushd $LLM_ROOT/examples/models/core/gpt
# Reference run with the LoRA checkpoint applied:
python3 $LLM_ROOT/examples/run.py \
--max_output_len=8 \
--lora_dir=gpt2b_lora-900.nemo \
--lora_ckpt_source nemo \
--lora_task_uids 0 \
--engine_dir ${DECODER_ENGINE_PATH} \
--input_file input.csv \
--output_csv output.csv \
--use_py_session \
--tokenizer_dir ${TOKENIZER_PATH}
popd
DECOUPLED_MODE="False"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
# LoRA is not supported in V1
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
python3 inflight_batcher_llm_client.py \
--input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \
--output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output.csv \
--check-output --request-output-len 8 \
--lora-path $LLM_ROOT/examples/models/core/gpt/gpt-2b-lora-train-900 \
--lora-task-id 12345
python3 inflight_batcher_llm_client.py \
--input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \
--output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output.csv \
--check-output --request-output-len 8 \
--lora-task-id 12345
ACCUMULATE_TOKENS=( "false" "true" )
E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
python3 end_to_end_grpc_client.py \
${STREAMING_FLAG} \
--output-len 100 --prompt "After Washington had returned to Williamsburg, Dinwiddie ordered him to lead a larger force to assist Trent in his work. While en route, Washington learned of Trent's retreat. Since Tanaghrisson had promised support to the British, Washington continued toward Fort Duquesne and met with the Mingo leader. Learning of a French scouting party in the area, Washington, with Tanaghrisson and his party, surprised the Canadians on May 28 in what became known as the Battle of Jumonville Glen. They killed many of the Canadians, including their commanding officer, Joseph Coulon de Jumonville, whose head was reportedly split open by Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that Tanaghrisson was acting to gain the support of the British and regain authority over his own people. They had been inclined to support the French, with whom they had long trading relationships. One of Tanaghrisson's men told Contrecoeur that Jumonville had been killed by British musket fire. Question: Upon learning of a French scounting party in the area, what did Washington do? Answer:" \
${OVERWRITE_OUTPUT_TEXT_FLAG} \
--lora-path $LLM_ROOT/examples/models/core/gpt/gpt-2b-lora-train-900 \
--lora-task-id 12345 \
--model-name "$E2E_MODEL_NAME" | tee "output_e2e_${E2E_MODEL_NAME}_${ACCUMULATE_TOKENS}"
grep "Answer: He surprised the Canadians on May 28 in what became known as the Battle of Jumonville" "output_e2e_${E2E_MODEL_NAME}_${ACCUMULATE_TOKENS}"
rt=$?
if [ ${rt} -ne 0 ]; then
echo "FAIL"
exit 1
else
echo "PASS"
fi
done
done
popd # inflight_batcher_llm/client
#run_cpp_e2e_backend_tests
kill_triton_server
done
fi
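# Speculative decoding with two server instances: the target server on ports
# 8000-8002 (KV-cache reuse enabled) and the draft server on ports 8003-8005,
# driven by speculative_decoding_test.py.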
if [ "$MODEL" = "gpt-speculative-decoding" ]; then
DECOUPLED_MODE="False"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
# Speculative decoding is not supported in V1
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
TRITON_HTTP_PORT="8000"
TRITON_GRPC_PORT="8001"
TRITON_METRICS_PORT="8002"
ENABLE_KV_CACHE_REUSE="true"
launch_triton_server
TRITON_HTTP_PORT="8003"
TRITON_GRPC_PORT="8004"
TRITON_METRICS_PORT="8005"
# TODO(nkorobov): The draft model can benefit from enabling KV cache reuse.
# Add --enable_context_fmha --use_paged_context_fmha to its build command
ENABLE_KV_CACHE_REUSE="false"
launch_triton_server
# Test client
pushd tools/inflight_batcher_llm
python3 speculative_decoding_test.py \
--max-input-len 200 \
--dataset ../dataset/mini_cnn_eval_spec_decoding.json \
--url-draft localhost:8004 \
--url-target localhost:8001 \
--url-control localhost:8001 \
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
--target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \
--verbose
popd # inflight_batcher_llm/client
kill_triton_server
done
fi
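# Disaggregated serving: fill_triton_repo splits the tensorrt_llm model into
# "context" and "generation" copies and installs disaggregated_serving_bls in
# their place; the e2e tests then run with TRTLLM_USE_MPI_KVCACHE=1.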
if [ "$MODEL" = "gpt-disaggregated-serving-bls" ]; then
DECOUPLED_MODE="False"
STREAMING="false"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="0.2"
export TRTLLM_USE_MPI_KVCACHE="1"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
# Disaggregated Serving is not supported in v1 batching
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
launch_triton_server
run_cpp_e2e_backend_tests
kill_triton_server
done
export TRTLLM_USE_MPI_KVCACHE="0"
fi
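# Gather-logits tests: without a draft engine, run the plain client and
# end-to-end tests (the context/generation-logits checks are disabled, see
# nvbugs 4796041); with a draft engine, run speculative decoding returning the
# draft-model draft logits and the target-model accepted-token logits.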
if [ "$MODEL" = "gpt-gather-logits" ]; then
if [ "${DRAFT_ENGINE_PATH}" == "" ]; then
# normal gather logits test
DECOUPLED_MODE="False"
MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_TRT_OVERLAP="${ENABLE_TRT_OVERLAPS[0]}"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
# Kaiyu: nvbugs 4796041
# python3 inflight_batcher_llm_client.py \
# --tokenizer-dir ${TOKENIZER_PATH} \
# --return-context-logits \
# --return-generation-logits
python3 inflight_batcher_llm_client.py \
--tokenizer-dir ${TOKENIZER_PATH}
popd # inflight_batcher_llm/client
pushd tools/inflight_batcher_llm
# Kaiyu: nvbugs 4796041
# python3 end_to_end_test.py \
# -i http \
# --max-input-len 192 \
# --return-context-logits \
# --return-generation-logits \
# --dataset ../dataset/mini_cnn_eval.json
python3 end_to_end_test.py \
-i http \
--max-input-len 192 \
--dataset ../dataset/mini_cnn_eval.json
popd # tools/inflight_batcher_llm
kill_triton_server
done
else
# Test with speculative decoding: speculative decoding returns the draft
# model's draft-token logits and the target model's accepted-token logits
DECOUPLED_MODE="False"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_TRT_OVERLAP="${ENABLE_TRT_OVERLAPS[0]}"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
# Speculative decoding is not supported in V1
if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
continue
fi
TRITON_HTTP_PORT="8000"
TRITON_GRPC_PORT="8001"
TRITON_METRICS_PORT="8002"
ENABLE_KV_CACHE_REUSE="true"
launch_triton_server
TRITON_HTTP_PORT="8003"
TRITON_GRPC_PORT="8004"
TRITON_METRICS_PORT="8005"
# TODO(nkorobov): The draft model can benefit from enabling KV cache reuse.
# Add --enable_context_fmha --use_paged_context_fmha to its build command
ENABLE_KV_CACHE_REUSE="false"
launch_triton_server
# Test client
pushd tools/inflight_batcher_llm
python3 speculative_decoding_test.py \
--max-input-len 128 \
--dataset ../dataset/mini_cnn_eval_spec_decoding.json \
--url-draft localhost:8004 \
--url-target localhost:8001 \
--url-control localhost:8001 \
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
--target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \
--num-draft-tokens=5 \
--return-target-model-accepted-token-logits \
--return-draft-model-draft-logits \
--verbose
popd # inflight_batcher_llm/client
kill_triton_server
done
fi
fi
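# Medusa speculative decoding: streaming and non-streaming runs checked against
# the short_input/output_end_id_medusa.csv reference tokens, with
# decoding_mode=medusa and end id 1284.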
if [ "$MODEL" = "medusa" ]; then
# To make sure that torch is not a dependency for C++ backend
# pip3 uninstall -y torch
# Test streaming
DECOUPLED_MODE="True"
STREAMING="true"
run_all_tests="true"
MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}"
DECODING_MODE="medusa"
END_ID_MEDUSA=1284
MEDUSA_INPUT_IDS_PATH='../../tools/dataset/short_input_end_id_medusa.csv'
MEDUSA_OUTPUT_IDS_PATH='../../tools/dataset/short_output_end_id_medusa.csv'
launch_triton_server
run_cpp_trtllm_backend_tests ${MEDUSA_INPUT_IDS_PATH} ${MEDUSA_OUTPUT_IDS_PATH} ${END_ID_MEDUSA}
kill_triton_server
# FIXME: the grpc e2e test returns a different result (because this is Medusa, not GPT) and has some issues with spaces
# Test non-streaming
DECOUPLED_MODE="False"
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
python3 inflight_batcher_llm_client.py \
--request-output-len=128 \
--end-id ${END_ID_MEDUSA} \
--request-id 1 \
--tokenizer-dir ${TOKENIZER_PATH} \
--input-tokens-csv ${MEDUSA_INPUT_IDS_PATH} \
--output-tokens-csv ${MEDUSA_OUTPUT_IDS_PATH} \
--check-output
popd # inflight_batcher_llm/client
kill_triton_server
fi
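# EAGLE speculative decoding: same flow as the Medusa block above, swept over
# both backends, with decoding_mode=eagle and an EAGLE-specific expected-output
# file.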
if [ "$MODEL" = "eagle" ]; then
# To make sure that torch is not a dependency for C++ backend
# pip3 uninstall -y torch
# Test streaming
DECOUPLED_MODE="True"
STREAMING="true"
run_all_tests="true"
MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}"
# chunked context is not supported yet.
ENABLE_CHUNKED_CONTEXT="false"
DECODING_MODE="eagle"
END_ID_EAGLE=1284
# Reuse the Medusa input file, since EAGLE is based on the same vicuna-v1.3-7b model as Medusa; the expected output is EAGLE-specific.
EAGLE_INPUT_IDS_PATH='../../tools/dataset/short_input_end_id_medusa.csv'
EAGLE_OUTPUT_IDS_PATH='../../tools/dataset/short_output_end_id_eagle.csv'
for BACKEND in "${BACKENDS[@]}"; do
launch_triton_server
run_cpp_trtllm_backend_tests ${EAGLE_INPUT_IDS_PATH} ${EAGLE_OUTPUT_IDS_PATH} ${END_ID_EAGLE}
kill_triton_server
done
# FIXME: the grpc e2e test returns a different result (because this is Eagle, not GPT) and has some issues with spaces
# Test non-streaming
DECOUPLED_MODE="False"
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
python3 inflight_batcher_llm_client.py \
--request-output-len=128 \
--end-id ${END_ID_EAGLE} \
--request-id 1 \
--tokenizer-dir ${TOKENIZER_PATH} \
--input-tokens-csv ${EAGLE_INPUT_IDS_PATH} \
--output-tokens-csv ${EAGLE_OUTPUT_IDS_PATH} \
--check-output
popd # inflight_batcher_llm/client
kill_triton_server
fi
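# Encoder-decoder models (T5 / BART): inflight_fused_batching only, chunked
# context disabled, cross-KV-cache fraction 0.5, C++ backend only; runs the
# same param-sweep / exclude-input / queue-delay / BLS tests as the
# decoder-only models.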
if [ "$MODEL" = "bart-ib" ] || [ "$MODEL" = "t5-ib" ]; then
# Non-streaming tests, decoupled is false
DECOUPLED_MODE="False"
STREAMING="false"
# enc-dec models only support inflight_fused_batching, with chunked context disabled
CHECK_PERF_JSON_ARGS=""
BATCHING_STRATEGY="inflight_fused_batching"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
ENABLE_CHUNKED_CONTEXT="false"
CROSS_KV_CACHE_FRACTION="0.5"
# -------------------------------
# Param sweep test
# -------------------------------
run_all_tests="true"
for BACKEND in "${BACKENDS[@]}"; do
for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do
for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do
# Because the runners are shared, the default value of 0.9 doesn't work, so skip
# if max_tokens_in_kv_cache is also empty
if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then
continue
fi
# Encoder-decoder models are not yet supported in the python backend
if [[ "${BACKEND}" == "python" ]]; then
continue
fi
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
run_all_tests="false"
done
done
done
BACKEND="${BACKENDS[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
# -------------------------------
# Exclude input in output test
# -------------------------------
EXCLUDE_INPUT_IN_OUTPUT="true"
run_all_tests="false"
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
EXCLUDE_INPUT_IN_OUTPUT="false"
# -------------------------------
# Max queue delay microseconds
# -------------------------------
run_all_tests="false"
MAX_QUEUE_DELAY_MICROSECONDS="1000000"
launch_triton_server
run_cpp_trtllm_backend_tests
run_cpp_e2e_backend_tests
kill_triton_server
MAX_QUEUE_DELAY_MICROSECONDS="0"
# -------------------------------
# Python BLS
# -------------------------------
ACCUMULATE_TOKENS=( "false" "true" )
E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
launch_triton_server
run_cpp_e2e_backend_tests
kill_triton_server
done
done
E2E_MODEL_NAME="ensemble"
ACCUMULATE_TOKEN="false"
# Reset
CROSS_KV_CACHE_FRACTION=""
fi
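# Multimodal and audio test cases (BLIP-2, mllama, Whisper, LLaVA variants,
# Qwen2-VL): each launches the server with the corresponding encoder engine and
# runs tools/multimodal/client.py or tools/whisper/client.py, grepping the
# output for an expected keyword where applicable.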
if [ "$MODEL" = "blip2-opt" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
# Test non-streaming
DECOUPLED_MODE="False"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
python3 tools/multimodal/client.py --model_type blip2 | tee multimodal_output
grep -oi "singapore" multimodal_output
kill_triton_server
done
# Test streaming
DECOUPLED_MODE="True"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
python3 tools/multimodal/client.py --model_type blip2 --streaming | tee multimodal_output
grep -oi "sing" multimodal_output
kill_triton_server
done
DECOUPLED_MODE="False"
# Python BLS
DECOUPLED_MODE="True"
ACCUMULATE_TOKENS=( "false" "true" )
E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
continue
fi
launch_triton_server
python3 tools/multimodal/client.py --model_type blip2 --use_bls --streaming | tee multimodal_output
grep -oi "sing" multimodal_output
kill_triton_server
done
done
done
E2E_MODEL_NAME="ensemble"
ACCUMULATE_TOKEN="false"
DECOUPLED_MODE="False"
# Test kv cache reuse
ENABLE_KV_CACHE_REUSE="True"
for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
launch_triton_server
python3 tools/multimodal/client.py --text "Question: Can you identify which city is depicted in this image based on the landmarks, architecture, and overall scenery? Please provide the name of the city along with any notable features that led you to your conclusion. Answer:" --model_type blip2 --prompt_table_extra_id 1
python3 tools/multimodal/client.py --text "Question: Can you identify which city is depicted in this image based on the landmarks, architecture, and overall scenery? Please provide the name of the city along with any notable features that led you to your conclusion. Answer:" --model_type blip2 --prompt_table_extra_id 1 | tee multimodal_output
grep -oi "singapore" multimodal_output
kill_triton_server
done
ENABLE_KV_CACHE_REUSE="False"
fi
if [ "$MODEL" = "mllama" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
CROSS_KV_CACHE_FRACTION="0.5"
BATCHING_STRATEGY="inflight_fused_batching"
# Test non-streaming
DECOUPLED_MODE="False"
for BACKEND in "${BACKENDS[@]}"; do
if [[ "${BACKEND}" == "python" ]]; then
continue
fi
launch_triton_server
python3 tools/multimodal/client.py --model_type mllama | tee multimodal_output
grep -oi "singapore" multimodal_output
kill_triton_server
done
# Test streaming
DECOUPLED_MODE="True"
for BACKEND in "${BACKENDS[@]}"; do
launch_triton_server
python3 tools/multimodal/client.py --model_type mllama --streaming | tee multimodal_output
grep -oi "singapore" multimodal_output
kill_triton_server
done
DECOUPLED_MODE="False"
CROSS_KV_CACHE_FRACTION=""
fi
if [ "$MODEL" = "whisper" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[1]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
# enc-dec models only support inflight_fused_batching, with chunked context disabled
BATCHING_STRATEGY="inflight_fused_batching"
ENABLE_CHUNKED_CONTEXT="false"
EXCLUDE_INPUT_IN_OUTPUT="true"
CROSS_KV_CACHE_FRACTION="0.5"
wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav
# Test non-streaming
DECOUPLED_MODE="False"
pip install tiktoken soundfile
launch_triton_server
python3 tools/whisper/client.py --audio-path 1221-135766-0002.wav
kill_triton_server
# Test streaming
DECOUPLED_MODE="True"
launch_triton_server
python3 tools/whisper/client.py --audio-path 1221-135766-0002.wav --streaming
kill_triton_server
EXCLUDE_INPUT_IN_OUTPUT="false"
DECOUPLED_MODE="False"
CROSS_KV_CACHE_FRACTION=""
fi
if [ "$MODEL" = "llava_onevision" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
# Test non-streaming
DECOUPLED_MODE="False"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava_onevision --end-id 151645 --pad-id 151643 | tee multimodal_output
grep -oi "singapore" multimodal_output
kill_triton_server
# Test streaming
DECOUPLED_MODE="True"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava_onevision --streaming --end-id 151645 --pad-id 151643 | tee multimodal_output
grep -oi "sing" multimodal_output
kill_triton_server
# Test with video input
DECOUPLED_MODE="False"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
VIDEO_PATH=$TOKENIZER_PATH'/../video-neva/test_video/video_test.mp4'
python3 tools/multimodal/client.py --model_type llava_onevision --end-id 151645 --pad-id 151643 --text "What is in this video?" --video $VIDEO_PATH --video_num_frames 8 | tee multimodal_output
grep -oi "robotic hand" multimodal_output
kill_triton_server
fi
if [ "$MODEL" = "qwen2_vl" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
# Test non-streaming
DECOUPLED_MODE="False"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type qwen2_vl --end-id 151645 --pad-id 151643 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
# Test streaming
DECOUPLED_MODE="True"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type qwen2_vl --streaming --end-id 151645 --pad-id 151643 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
fi
if [ "$MODEL" = "llava" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[1]}"
ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[1]}"
echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"
# Test non-streaming
DECOUPLED_MODE="False"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava --end-id 2 --pad-id 32001 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
# Test streaming
DECOUPLED_MODE="True"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava --streaming --end-id 2 --pad-id 32001 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
fi
if [ "$MODEL" = "llava_fp8" ]; then
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[1]}"
ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[1]}"
echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"
# Test non-streaming
DECOUPLED_MODE="False"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava --end-id 2 --pad-id 32001 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
# Test streaming
DECOUPLED_MODE="True"
BATCHING_STRATEGY="inflight_fused_batching"
launch_triton_server
python3 tools/multimodal/client.py --model_type llava --streaming --end-id 2 --pad-id 32001 | tee multimodal_output
grep -oi "Singapore" multimodal_output
kill_triton_server
fi
if [ "$MODEL" = "gpt-ib-lad" ]; then
# Lookahead decoding test
DECOUPLED_MODE="False"
STREAMING="true"
run_all_tests="true"
MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}"
DECODING_MODE="lookahead"
# Lookahead parameters
LOOKAHEAD_WINDOW_SIZE=7
LOOKAHEAD_NGRAM_SIZE=7
LOOKAHEAD_VERIFICATION_SET_SIZE=7
LOOKAHEAD_CONFIG="--lookahead_config=[${LOOKAHEAD_WINDOW_SIZE},${LOOKAHEAD_NGRAM_SIZE},${LOOKAHEAD_VERIFICATION_SET_SIZE}]"
launch_triton_server
# Test client
pushd inflight_batcher_llm/client
python3 inflight_batcher_llm_client.py \
${LOOKAHEAD_CONFIG} \
--tokenizer-dir ${TOKENIZER_PATH}
popd # inflight_batcher_llm/client
kill_triton_server
fi
popd # $LLM_BACKEND_ROOT