#!/usr/bin/bash

MODEL=$1
DECODER_ENGINE_PATH=$2
TOKENIZER_PATH=$3
TOKENIZER_TYPE=$4
DRAFT_ENGINE_PATH=$5
TARGET_ENGINE_PATH=$6
ENCODER_ENGINE_PATH=$7
MULTIMODAL_ENGINE_PATH=$8

set -ex
set -o pipefail

nvidia-smi

pushd $LLM_BACKEND_ROOT

source tools/utils.sh

TRITON_REPO="triton_repo"

kill_triton_server () {
    pkill -9 -f trtllmExecutorWorker || true
    pkill -9 -f tritonserver
}

# Kill tritonserver if it is still pending from a previous test.
kill_triton_server || true

if [ "$MODEL" = "mistral" ] || [ "$MODEL" = "mistral-ib" ] || [ "$MODEL" = "mistral-ib-mm" ]; then
    MAX_ATTENTION_WINDOW_SIZE="2048"
    MAX_SEQUENCE_LEN="8704" # max_input_len + max_output_len
elif [ "$MODEL" = "t5-ib" ] || [ "$MODEL" = "bart-ib" ]; then
    MAX_ATTENTION_WINDOW_SIZE=""
    MAX_SEQUENCE_LEN="4096" # for enc-dec, choose a sufficiently large max token count in the KV cache to avoid no-free-block errors
elif [ "$MODEL" = "whisper" ]; then
    MAX_ATTENTION_WINDOW_SIZE=""
    MAX_SEQUENCE_LEN="24000" # WAR to avoid no-free-block errors
else
    MAX_ATTENTION_WINDOW_SIZE=""
    MAX_SEQUENCE_LEN="2048"
fi

if [ "$MODEL" = "mllama" ]; then
    ENCODER_INPUT_FEATURES_DTYPE="TYPE_BF16"
else
    ENCODER_INPUT_FEATURES_DTYPE="TYPE_FP16"
fi

if [ "$MODEL" = "gpt" ] || [ "$MODEL" = "opt" ] || [ "$MODEL" = "llama" ] || [ "$MODEL" = "gptj" ] || [ "$MODEL" = "mistral" ]; then
    rm -rf ${TRITON_REPO}
    cp -R all_models/gpt ${TRITON_REPO}

    # Modify config.pbtxt
    python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt engine_dir:${DECODER_ENGINE_PATH}
    python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH}
    python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH}

    # Launch Triton Server
    mpirun --allow-run-as-root \
        -n 1 /opt/tritonserver/bin/tritonserver \
        --model-repository=${TRITON_REPO} \
        --disable-auto-complete-config \
        --backend-config=python,shm-region-prefix-name=prefix0_ : &
    export SERVER_PID=$!
    wait_for_server_ready ${SERVER_PID} 1200 ${TRITON_HTTP_PORT}

    pushd tools/gpt/

    # Client
    python3 client.py \
        --text="Born in north-east France, Soyer trained as a" \
        --output_len=10 \
        --protocol=http \
        --tokenizer_dir ${TOKENIZER_PATH}
    python3 client.py \
        --text="Born in north-east France, Soyer trained as a" \
        --output_len=10 \
        --protocol=grpc \
        --tokenizer_dir ${TOKENIZER_PATH}

    # Async Client
    python3 client_async.py \
        --text="Born in north-east France, Soyer trained as a" \
        --output_len=10 \
        --protocol=http \
        --tokenizer_dir ${TOKENIZER_PATH}
    python3 client_async.py \
        --text="Born in north-east France, Soyer trained as a" \
        --output_len=10 \
        --protocol=grpc \
        --tokenizer_dir ${TOKENIZER_PATH}

    # End to end test
    python3 end_to_end_test.py \
        --tokenizer_dir ${TOKENIZER_PATH}

    # Benchmark Core Model
    python3 benchmark_core_model.py \
        --batch_size=8 --start_len=128 --output_len=20 \
        --protocol=http --mode=sync
    python3 benchmark_core_model.py \
        --batch_size=8 --start_len=128 --output_len=20 \
        --protocol=grpc --mode=sync
    python3 benchmark_core_model.py \
        --batch_size=8 --start_len=128 --output_len=20 \
        --protocol=http --mode=async
    python3 benchmark_core_model.py \
        --batch_size=8 --start_len=128 --output_len=20 \
        --protocol=grpc --mode=async

    # Benchmark using Perf Analyzer
    python3 gen_input_data.py
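    # gen_input_data.py writes input_data.json, which the (currently disabled)
    # perf_analyzer invocations below consume via --input-data.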
    # FIXME(kaiyu): Uncomment this when perf_analyzer is available.
    # perf_analyzer -m tensorrt_llm -v \
    #     -b 8 --input-data input_data.json \
    #     --concurrency-range 2 \
    #     -i http \
    #     -u 'localhost:8000'
    # perf_analyzer -m tensorrt_llm -v \
    #     -b 8 --input-data input_data.json \
    #     --concurrency-range 2 \
    #     -i grpc \
    #     -u 'localhost:8001'

    kill ${SERVER_PID}

    popd # tools/gpt
fi
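# wait_for_server_ready is provided by tools/utils.sh (sourced above). For
# readers without that file, a rough, hypothetical equivalent (not used by this
# script) would poll Triton's readiness endpoint while the server process stays
# alive; the real helper may differ in details.
wait_for_server_ready_sketch () {
    local pid=$1 timeout_s=$2 http_port=${3:-8000}
    for ((i = 0; i < timeout_s; i++)); do
        # Bail out early if the server process has died.
        kill -0 "${pid}" 2> /dev/null || return 1
        # /v2/health/ready returns 200 once all models are loaded.
        if curl -sf "localhost:${http_port}/v2/health/ready" > /dev/null; then
            return 0
        fi
        sleep 1
    done
    return 1 # timed out
}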
print_test_params () {
    echo "----------------------------------"
    echo " Test parameters:"
    echo "----------------------------------"
    echo "BACKEND: ${BACKEND}"
    echo "BATCHING_STRATEGY: ${BATCHING_STRATEGY}"
    echo "MAX_TOKENS_IN_KV_CACHE: ${MAX_TOKENS_IN_KV_CACHE}"
    echo "MAX_ATTENTION_WINDOW_SIZE: ${MAX_ATTENTION_WINDOW_SIZE}"
    echo "BATCH_SCHEDULER_POLICY: ${BATCH_SCHEDULER_POLICY}"
    echo "KV_CACHE_FREE_GPU_MEM_FRACTION: ${KV_CACHE_FREE_GPU_MEM_FRACTION}"
    echo "CROSS_KV_CACHE_FRACTION: ${CROSS_KV_CACHE_FRACTION}"
    echo "EXCLUDE_INPUT_IN_OUTPUT: ${EXCLUDE_INPUT_IN_OUTPUT}"
    echo "TRITON_MAX_BATCH_SIZE: ${TRITON_MAX_BATCH_SIZE}"
    echo "MAX_QUEUE_DELAY_MICROSECONDS: ${MAX_QUEUE_DELAY_MICROSECONDS}"
    echo "MAX_BEAM_WIDTH: ${MAX_BEAM_WIDTH}"
    echo "ENABLE_KV_CACHE_REUSE: ${ENABLE_KV_CACHE_REUSE}"
    echo "E2E_MODEL_NAME: ${E2E_MODEL_NAME}"
    echo "TENSORRT_LLM_MODEL_NAME: ${TENSORRT_LLM_MODEL_NAME}"
    echo "TENSORRT_LLM_TARGET_MODEL_NAME: ${TENSORRT_LLM_TARGET_MODEL_NAME}"
    echo "TENSORRT_LLM_DRAFT_MODEL_NAME: ${TENSORRT_LLM_DRAFT_MODEL_NAME}"
    echo "ACCUMULATE_TOKEN: ${ACCUMULATE_TOKEN}"
    echo "BLS_INSTANCE_COUNT: ${BLS_INSTANCE_COUNT}"
    echo "PREPROCESSING_INSTANCE_COUNT: ${PREPROCESSING_INSTANCE_COUNT}"
    echo "POSTPROCESSING_INSTANCE_COUNT: ${POSTPROCESSING_INSTANCE_COUNT}"
    echo "NORMALIZE_LOG_PROBS: ${NORMALIZE_LOG_PROBS}"
    echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"
    echo "GPU_DEVICE_IDS: ${GPU_DEVICE_IDS}"
    echo "DECODING_MODE: ${DECODING_MODE}"
    echo "MAX_QUEUE_SIZE: ${MAX_QUEUE_SIZE}"
    echo "ENABLE_CONTEXT_FMHA_FP32_ACC: ${ENABLE_CONTEXT_FMHA_FP32_ACC}"
    echo "PROMPT_EMBEDDING_TABLE_DTYPE: ${PROMPT_EMBEDDING_TABLE_DTYPE}"
    echo "run_all_tests: ${run_all_tests}"
    echo "----------------------------------"
}

fill_triton_repo () {
    if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ]; then
        cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft
        sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt
    fi
    if [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
        cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_target
        sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_target"/g' ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt
    fi

    echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm with engine ${DECODER_ENGINE_PATH}"
    python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32,lookahead_window_size:${LOOKAHEAD_WINDOW_SIZE},lookahead_ngram_size:${LOOKAHEAD_NGRAM_SIZE},lookahead_verification_set_size:${LOOKAHEAD_VERIFICATION_SET_SIZE}
    python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT}
    python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT}
    python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
    if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ] && [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_TARGET_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE}
    else
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:"",prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE}
    fi
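    # Note on the fill_template.py calls in this function: the tool substitutes
    # ${placeholder} tokens in the given config.pbtxt with the comma-separated
    # key:value pairs from the command line, e.g. (hypothetical values):
    #   python3 tools/fill_template.py -i my_model/config.pbtxt \
    #       triton_max_batch_size:64,decoupled_mode:False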
    if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ]; then
        echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_draft with engine ${DRAFT_ENGINE_PATH}"
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32
    fi
    if [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then
        echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_target with engine ${TARGET_ENGINE_PATH}"
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt triton_backend:${BACKEND},engine_dir:${TARGET_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:true,normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32
    fi

    # Encoder-decoder models only
    if [ "${CROSS_KV_CACHE_FRACTION}" != "" ]; then
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION}
    fi
    if [ "${ENCODER_ENGINE_PATH}" != "" ] && [ "${ENCODER_ENGINE_PATH}" != "skip" ]; then
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt encoder_engine_dir:${ENCODER_ENGINE_PATH}
    fi
    if [ "${MULTIMODAL_ENGINE_PATH}" != "" ] && [ "${MULTIMODAL_ENGINE_PATH}" != "skip" ]; then
        cp all_models/multimodal/ensemble ${TRITON_REPO} -r
        cp all_models/multimodal/multimodal_encoders ${TRITON_REPO} -r
        python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
        python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${DECODER_ENGINE_PATH}
        python3 tools/fill_template.py -i ${TRITON_REPO}/multimodal_encoders/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},hf_model_path:${TOKENIZER_PATH}
        python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt multimodal_encoders_name:multimodal_encoders
    fi
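    # Once the server is up, the assembled repository can be sanity-checked
    # with Triton's repository index API, e.g. (illustrative only, not
    # executed as part of the test):
    #   curl -s -X POST localhost:${TRITON_HTTP_PORT}/v2/repository/index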
    if [ "$MODEL" = "whisper" ]; then
        cp all_models/whisper/whisper_bls ${TRITON_REPO} -r
        rm -r ${TRITON_REPO}/preprocessing ${TRITON_REPO}/postprocessing ${TRITON_REPO}/ensemble ${TRITON_REPO}/tensorrt_llm_bls
        python3 tools/fill_template.py -i ${TRITON_REPO}/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:128,zero_pad:false,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE}
        wget -nc --directory-prefix=${TRITON_REPO}/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
        wget -nc --directory-prefix=${TRITON_REPO}/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
    fi

    if [ "$MODEL" = "gpt-disaggregated-serving-bls" ]; then
        cp -r ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/generation
        mv ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/context
        cp -r all_models/disaggregated_serving/disaggregated_serving_bls/ ${TRITON_REPO}
        python3 tools/fill_template.py -i ${TRITON_REPO}/disaggregated_serving_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},disaggregated_serving_bls_count:${BLS_INSTANCE_COUNT},context_model_name:context,generation_model_name:generation,logits_datatype:TYPE_FP32
        mv ${TRITON_REPO}/disaggregated_serving_bls ${TRITON_REPO}/tensorrt_llm
        sed 's/name: "disaggregated_serving_bls"/name: "tensorrt_llm"/' -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt
        sed 's/name: "tensorrt_llm"/name: "context"/' -i ${TRITON_REPO}/context/config.pbtxt
        sed 's/name: "tensorrt_llm"/name: "generation"/' -i ${TRITON_REPO}/generation/config.pbtxt
    fi
}

launch_triton_server () {
    print_test_params
    rm -rf ${TRITON_REPO}
    cp -R all_models/inflight_batcher_llm ${TRITON_REPO}
    fill_triton_repo

    # Launch Triton Server
    /opt/tritonserver/bin/tritonserver \
        --disable-auto-complete-config \
        --model-repository=${TRITON_REPO} \
        --http-port ${TRITON_HTTP_PORT} \
        --grpc-port ${TRITON_GRPC_PORT} \
        --metrics-port ${TRITON_METRICS_PORT} > log.txt 2>&1 &
    export SERVER_PID=$!
    wait_for_server_ready ${SERVER_PID} 1200 ${TRITON_HTTP_PORT}
}
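# A quick manual smoke test against a launched server (illustrative only, not
# executed here; assumes the HTTP generate extension is enabled for the
# ensemble model):
#   curl -s -X POST localhost:${TRITON_HTTP_PORT}/v2/models/ensemble/generate \
#       -d '{"text_input": "Hello", "max_tokens": 10}'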
test_stop_words () {
    # Test to run for all combinations of flags.
    EXCL_INPUT_IN_OUTPUT_FLAG=""
    [ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"
    STREAMING_FLAG=""
    [ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"
    BEAM_FLAG="--beam-width 1"
    [ "${DECODING_MODE}" = "beam_search" ] && BEAM_FLAG="--beam-width 2"

    # Test client
    pushd inflight_batcher_llm/client
    if [[ $MODEL = "gpt-ib" ]]; then
        PROMPT="The only thing we have to fear is"
        OUTLEN=10
        ORIGINAL_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" 2>&1 | tail -n 1)
        echo "original output"
        echo $ORIGINAL_OUTPUT # should be something like "[...] that the government will [...]"

        # Examples of stop words that won't affect generation:
        # "government" isn't tokenized like " government";
        # " that the public" doesn't entirely match the generated string.
        TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words "government" " that the public" 2>&1 | tail -n 1)
        [[ "${ORIGINAL_OUTPUT}" == "${TEST_OUTPUT}" ]]

        # Check that the output finishes at "government".
        TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words " lorem" " government" 2>&1 | tail -n 1)
        [[ "${TEST_OUTPUT}" == *"government" ]]

        TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words " that the government" 2>&1 | tail -n 1)
        [[ "${TEST_OUTPUT}" == *"government" ]]
    else
        PROMPT="What does Jonathan mean?"
        OUTLEN=10
        # Only the BLS backend supports stop-word detection at the word level.
        # The word "Jonathan" has multiple tokenizations, which is not detected in TRT-LLM.
        TEST_OUTPUT=$(python3 end_to_end_grpc_client.py ${STREAMING_FLAG} -o ${OUTLEN} -p "${PROMPT}" --stop-words "Jonathan" --model-name "tensorrt_llm_bls" $BEAM_FLAG 2>&1 | tail -n 1)
    fi
    popd
}

run_cpp_trtllm_backend_tests () {
    # Test to run for all combinations of flags.
    EXCL_INPUT_IN_OUTPUT_FLAG=""
    [ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"
    STREAMING_FLAG=""
    [ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"

    # Test client
    pushd inflight_batcher_llm/client
    if [ -n "${MAX_ATTENTION_WINDOW_SIZE}" ]; then
        # Test using a longer input.
        # TODO: Once we switch to using real weights, add `--check-output` arg
        python3 inflight_batcher_llm_client.py \
            ${STREAMING_FLAG} \
            --tokenizer-dir ${TOKENIZER_PATH} \
            --input-tokens-csv='../../tools/dataset/long_input.csv' \
            --output-tokens-csv='../../tools/dataset/long_output.csv' \
            ${EXCL_INPUT_IN_OUTPUT_FLAG} \
            2>&1 | tee output_long_input

        # If the prompt is excluded from the output, check that the output
        # sequence isn't an empty list of tokens.
        if [ -n "${EXCL_INPUT_IN_OUTPUT_FLAG}" ]; then
            grep -o "Output sequence starts with: \[1, 3189, 28809, 28707, 7234, 574, 3441, 1236, 28723, 28705" output_long_input
        else
            grep -o "Output sequence\( starts with\)\?:\s*\[\([0-9]*\,\?\s\?\)*\]" output_long_input
        fi
    fi

    # Testing output accuracy for real weights only.
    CHECK_OUTPUT_FLAG=""
    if [ "$MODEL" = "gpt-ib" ]; then
        CHECK_OUTPUT_FLAG="--check-output"
    fi
    python3 inflight_batcher_llm_client.py \
        ${STREAMING_FLAG} \
        ${CHECK_OUTPUT_FLAG} \
        ${EXCL_INPUT_IN_OUTPUT_FLAG} \
        --tokenizer-dir ${TOKENIZER_PATH}

    # Check that metrics work as expected by looking at the number of
    # successful requests for tensorrt_llm.
    num_success=$(curl localhost:${TRITON_METRICS_PORT}/metrics 2>&1 | grep nv_inference_request_success\{model=\"tensorrt_llm\" | cut -d " " -f 2)
    if (( num_success <= 0 )); then
        exit 1
    else
        echo "Number of successful requests: $num_success"
    fi

    if [[ "$run_all_tests" == "true" && "$BATCHING_STRATEGY" == "inflight_fused_batching" ]]; then
        # Testing output accuracy for real weights only.
        if [[ $MODEL = "gpt-ib" ]] || [[ $MODEL = "mistral-ib-streaming" ]]; then
            popd
            test_stop_words
            pushd inflight_batcher_llm/client
        fi

        # Stop request
        python3 inflight_batcher_llm_client.py \
            ${STREAMING_FLAG} \
            --request-output-len=128 \
            --stop-after-ms 100 \
            --tokenizer-dir ${TOKENIZER_PATH} \
            --request-id 1 \
            2>&1 | tee output_w_stop
        grep "Got cancellation response" output_w_stop
        if [[ "${STREAMING}" == "true" ]]; then
            # Request cancellation
            python3 inflight_batcher_llm_client.py \
                ${EXCL_INPUT_IN_OUTPUT_FLAG} \
                --streaming \
                --request-output-len=128 \
                --stop-after-ms 100 \
                --request-id 1 \
                --stop-via-request-cancel \
                --tokenizer-dir ${TOKENIZER_PATH} 2>&1 | tee output_w_stop
            grep "Request is cancelled" output_w_stop
        fi

        if [[ -n "${1}" && -n "${2}" && -n "${3}" ]]; then
            python3 inflight_batcher_llm_client.py \
                ${EXCL_INPUT_IN_OUTPUT_FLAG} \
                ${STREAMING_FLAG} \
                --request-output-len=128 \
                --end-id $3 \
                --request-id 1 \
                --tokenizer-dir ${TOKENIZER_PATH} \
                --input-tokens-csv=$1 \
                --output-tokens-csv=$2 \
                --check-output
        fi

        # Test with returned log probs
        python3 inflight_batcher_llm_client.py \
            ${STREAMING_FLAG} \
            --request-output-len=10 \
            --tokenizer-dir ${TOKENIZER_PATH} \
            --return-log-probs --top-k 2 \
            2>&1 | tee output_log_probs

        # Test with a string request id
        python3 inflight_batcher_llm_client.py \
            ${STREAMING_FLAG} \
            ${CHECK_OUTPUT_FLAG} \
            --tokenizer-dir ${TOKENIZER_PATH} \
            --request-id my_request 2>&1 | tee output_str_request

        # n-return requires the decoupled mode.
        if [[ "${DECOUPLED_MODE}" == "True" ]]; then
            # Test with n returns
            python3 inflight_batcher_llm_client.py \
                ${STREAMING_FLAG} \
                ${CHECK_OUTPUT_FLAG} \
                --tokenizer-dir ${TOKENIZER_PATH} \
                --num-return-sequences 2 2>&1 | tee output_n_return
        fi

        # Test that Triton metrics are present and have non-zero values (when applicable).
        TRITON_METRICS_LOG="triton_metrics.out"
        curl localhost:${TRITON_METRICS_PORT}/metrics -o ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="context",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="scheduled",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="max",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="active",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_request_metrics\{model="tensorrt_llm",request_type="waiting",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="pinned",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="gpu",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_runtime_memory_metrics\{memory_type="cpu",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="used",model="tensorrt_llm",version="1"\} [0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="free",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="max",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_kv_cache_block_metrics\{kv_cache_block_type="fraction",model="tensorrt_llm",version="1"\} [0-9]*\.?[0-9]+$' ${TRITON_METRICS_LOG}
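        # For reference, each metric line scraped above has the Prometheus form
        #   <name>{<label>="<value>",...} <numeric value>
        # so a single value can be pulled out with, e.g.:
        #   awk '/request_type="active"/ {print $2}' ${TRITON_METRICS_LOG}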
        if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="num_ctx_tokens",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="num_gen_tokens",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{model="tensorrt_llm",v1_specific_metric="empty_gen_slots",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        else
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="paused_requests",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="micro_batch_id",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="generation_requests",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
            grep -E 'nv_trt_llm_inflight_batcher_metrics\{inflight_batcher_specific_metric="total_context_tokens",model="tensorrt_llm",version="1"\} [0-9]+$' ${TRITON_METRICS_LOG}
        fi
        grep -E 'nv_trt_llm_general_metrics\{general_type="iteration_counter",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        grep -E 'nv_trt_llm_general_metrics\{general_type="timestamp",model="tensorrt_llm",version="1"\} [1-9][0-9]*$' ${TRITON_METRICS_LOG}
        rm ${TRITON_METRICS_LOG}
    fi
    popd # inflight_batcher_llm/client

    # End to end test
    pushd tools/inflight_batcher_llm
    # HTTP client cannot be used with decoupled mode.
    if [[ "${DECOUPLED_MODE}" == "False" ]]; then
        python3 benchmark_core_model.py \
            ${EXCL_INPUT_IN_OUTPUT_FLAG} \
            --concurrency 8 \
            -i http \
            --max-input-len 300 \
            dataset \
            --dataset ../dataset/mini_cnn_eval.json \
            --tokenizer-dir ${TOKENIZER_PATH}
    fi

    if [[ "$run_all_tests" == "true" ]]; then
        # Note: the streaming flag is not set to 1 for these benchmarks,
        # regardless of the value of $STREAMING.
        DECOUPLED_FLAG=""
        [ "${DECOUPLED_MODE}" = "True" ] && DECOUPLED_FLAG="--decoupled"
        python3 benchmark_core_model.py \
            ${DECOUPLED_FLAG} \
            ${EXCL_INPUT_IN_OUTPUT_FLAG} \
            --concurrency 8 \
            -i grpc \
            --max-input-len 300 \
            --num-requests 80 \
            dataset \
            --dataset ../dataset/mini_cnn_eval.json \
            --tokenizer-dir ${TOKENIZER_PATH}
        # Performance check.
        python3 benchmark_core_model.py \
            ${DECOUPLED_FLAG} \
            ${CHECK_PERF_JSON_ARGS} \
            --check-perf-key ${MODEL}-${BACKEND} \
            --check-perf-rtol 0.05 \
            --check-perf-atol 50 \
            --concurrency 8 \
            -i grpc \
            --max-input-len 300 \
            --request-rate -1 \
            --num-requests 1000 \
            token-norm-dist \
            --input-mean 128 --input-stdev 0 \
            --output-mean 20 --output-stdev 0
        python3 benchmark_core_model.py \
            ${DECOUPLED_FLAG} \
            -i grpc --max-input-len 1000 \
            --request-rate -1 \
            token-from-histogram --histogram-key example
    fi
    popd # tools/inflight_batcher_llm
}

run_cpp_e2e_backend_tests () {
    STREAMING_FLAG=""
    [ "${STREAMING}" = "true" ] && STREAMING_FLAG="--streaming"
    OVERWRITE_OUTPUT_TEXT_FLAG=""
    [ "${ACCUMULATE_TOKEN}" = "true" ] && OVERWRITE_OUTPUT_TEXT_FLAG="--overwrite-output-text"
    EXCL_INPUT_IN_OUTPUT_FLAG=""
    [ "${EXCLUDE_INPUT_IN_OUTPUT}" = "true" ] && EXCL_INPUT_IN_OUTPUT_FLAG="--exclude-input-in-output"

    pushd inflight_batcher_llm/client
    # Testing output accuracy for real weights only.
    if [[ $MODEL = "gpt-ib" || $MODEL = "gpt-ib-streaming" ]]; then
        python3 end_to_end_grpc_client.py \
            ${STREAMING_FLAG} \
            --output-len 10 --prompt "The only thing we have to fear is" \
            ${OVERWRITE_OUTPUT_TEXT_FLAG} \
            ${EXCL_INPUT_IN_OUTPUT_FLAG} \
            --model-name "$E2E_MODEL_NAME" | tee output_e2e
        grep "that the government will" output_e2e
        if [[ "$EXCL_INPUT_IN_OUTPUT_FLAG" != "" ]]; then
            grep -v "The only thing we have to fear is" output_e2e
        fi

        if [[ "$run_all_tests" == "true" && "$BATCHING_STRATEGY" == "inflight_fused_batching" ]]; then
            # Test with embedding bias
            python3 end_to_end_grpc_client.py \
                ${STREAMING_FLAG} \
                ${OVERWRITE_OUTPUT_TEXT_FLAG} \
                -o 10 \
                -p "The only thing we have to fear is" \
                --embedding-bias-words " government" \
                --embedding-bias-weights -20 \
                --model-name "$E2E_MODEL_NAME" \
                ${EXCL_INPUT_IN_OUTPUT_FLAG} \
                2>&1 | tee output_w_bias
            grep -v "that the government will" output_w_bias
            if [[ "$EXCL_INPUT_IN_OUTPUT_FLAG" != "" ]]; then
                grep -v "The only thing we have to fear is" output_e2e
            fi

            # Only run the batched test in streaming mode for now, since it requires decoupled mode.
            if [[ "$DECOUPLED_MODE" == "True" ]]; then
                # Test with batched requests
                python3 end_to_end_grpc_client.py \
                    ${STREAMING_FLAG} \
                    ${OVERWRITE_OUTPUT_TEXT_FLAG} \
                    ${EXCL_INPUT_IN_OUTPUT_FLAG} \
                    -o 5 \
                    --model-name "$E2E_MODEL_NAME" \
                    -p '["This is a test","I want you to","The cat is"]' \
                    --batch-inputs --check-outputs --expected-outputs '[" of the power of the"," know that I am not"," a very good cat."]'
            fi
        fi
    fi
    popd # inflight_batcher_llm/client

    # End to end test
    pushd tools/inflight_batcher_llm
    # end_to_end_test.py doesn't support streaming.
    if [[ "${STREAMING}" == "false" ]]; then
        python3 end_to_end_test.py \
            --concurrency 8 \
            -i http \
            --max-input-len 200 \
            --test-bls \
            --dataset ../dataset/mini_cnn_eval.json
        if [[ "$run_all_tests" == "true" ]]; then
            python3 end_to_end_test.py \
                --concurrency 8 \
                -i grpc \
                --max-input-len 200 \
                --test-bls \
                --dataset ../dataset/mini_cnn_eval.json
        fi
    fi
    popd # tools/inflight_batcher_llm
}

run_cpp_trtllm_queue_size_tests () {
    # Test client
    echo "25229,291,7379,251522,39854,5754,251514,315,32906,14297,398,261" > input.csv
    pushd tools/inflight_batcher_llm
    EXTRA_FLAGS=""
    if [[ "${DECOUPLED_MODE}" == "True" ]]; then
        EXTRA_FLAGS="-p grpc -u localhost:8001"
    fi
    python3 test_max_queue_size.py --input-tokens-csv ../../input.csv --request-output-len 256 --num-requests 100 ${EXTRA_FLAGS}
    popd # tools/inflight_batcher_llm
}
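# The arrays below drive the parameter sweeps. For gpt-ib the full cross
# product is 2 (BACKENDS) x 1 (BATCHING_STRATEGIES) x 2 (MAX_TOKENS_IN_KV_CACHES)
# x 2 (BATCH_SCHEDULER_POLICIES) x 2 (KV_CACHE_FREE_GPU_MEM_FRACTIONS)
# x 2 (ENABLE_CHUNKED_CONTEXTS) = 32 combinations, minus the ones skipped
# inside the loops.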
BACKENDS=( "tensorrtllm" "python" )
BATCHING_STRATEGIES=( "inflight_fused_batching" )
MAX_TOKENS_IN_KV_CACHES=( "" $MAX_SEQUENCE_LEN )
BATCH_SCHEDULER_POLICIES=( "guaranteed_no_evict" "max_utilization" )
KV_CACHE_FREE_GPU_MEM_FRACTIONS=( "0.2" "" )
CROSS_KV_CACHE_FRACTION=""
ENABLE_CHUNKED_CONTEXTS=( "false" "true" )

BACKEND="tensorrtllm"
TRITON_MAX_BATCH_SIZE="128"
MAX_QUEUE_DELAY_MICROSECONDS="0"
MAX_BEAM_WIDTH="1"
ENABLE_KV_CACHE_REUSE="false"
E2E_MODEL_NAME="ensemble"
TENSORRT_LLM_MODEL_NAME="tensorrt_llm"
TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft"
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm_target"
ACCUMULATE_TOKEN="false"
EXCLUDE_INPUT_IN_OUTPUT="false"
BLS_INSTANCE_COUNT="1"
PREPROCESSING_INSTANCE_COUNT="1"
POSTPROCESSING_INSTANCE_COUNT="1"
NORMALIZE_LOG_PROBS="true"
TRITON_HTTP_PORT="8000"
TRITON_GRPC_PORT="8001"
TRITON_METRICS_PORT="8002"
GPU_DEVICE_IDS=""
DECODING_MODE="top_k_top_p"
MAX_QUEUE_SIZE="0"
PROMPT_EMBEDDING_TABLE_DTYPE="TYPE_FP16"

if [ "$MODEL" = "gpt-ib" ] || [ "$MODEL" = "mistral-ib" ] || [ "$MODEL" = "mistral-ib-mm" ]; then
    # Non-streaming tests, so decoupled mode is false.
    DECOUPLED_MODE="False"
    STREAMING="false"

    # -------------------------------
    # Param sweep test
    # -------------------------------
    run_all_tests="true"
    for BACKEND in "${BACKENDS[@]}"; do
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
    for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do
    for BATCH_SCHEDULER_POLICY in "${BATCH_SCHEDULER_POLICIES[@]}"; do
    for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do
    for ENABLE_CHUNKED_CONTEXT in "${ENABLE_CHUNKED_CONTEXTS[@]}"; do
        # Because the runners are shared, the default free-memory fraction of 0.9
        # doesn't work, so skip if max_tokens_in_kv_cache is also empty.
        if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then
            continue
        fi
        if [[ "${BATCHING_STRATEGY}" == "v1" && "${BATCH_SCHEDULER_POLICY}" == "max_utilization" ]]; then
            continue
        fi
        # For V1, the batch scheduler currently cannot properly estimate KV cache usage.
        if [[ "${BATCHING_STRATEGY}" == "v1" && "${MAX_TOKENS_IN_KV_CACHE}" != "" ]]; then
            continue
        fi
        # mistral is built without chunked-context support.
        if [[ "$MODEL" = "mistral-ib" && "${ENABLE_CHUNKED_CONTEXT}" == "true" ]]; then
            continue
        fi
        if [[ "$MODEL" = "mistral-ib-mm" && "${ENABLE_CHUNKED_CONTEXT}" == "true" ]]; then
            continue
        fi
        if [[ "$MODEL" = "mistral-ib-mm" ]]; then
            export TRTLLM_ORCHESTRATOR=1
        fi
        launch_triton_server
        run_cpp_trtllm_backend_tests
        run_cpp_e2e_backend_tests
        kill_triton_server
        run_all_tests="false"
    done
    done
    done
    done
    done
    done

    BACKEND="${BACKENDS[0]}"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
    ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[0]}"

    # -------------------------------
    # Exclude input in output test
    # -------------------------------
    EXCLUDE_INPUT_IN_OUTPUT="true"
    run_all_tests="false"
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
        launch_triton_server
        run_cpp_trtllm_backend_tests
        run_cpp_e2e_backend_tests
        kill_triton_server
    done
    EXCLUDE_INPUT_IN_OUTPUT="false"

    # -------------------------------
    # Max queue delay microseconds
    # -------------------------------
    run_all_tests="false"
    MAX_QUEUE_DELAY_MICROSECONDS="1000000"
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
        launch_triton_server
        run_cpp_trtllm_backend_tests
        run_cpp_e2e_backend_tests
        kill_triton_server
    done
    MAX_QUEUE_DELAY_MICROSECONDS="0"

    # -------------------------------
    # Max queue size
    # -------------------------------
    run_all_tests="false"
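    # With TRITON_MAX_BATCH_SIZE=1, at most one request executes at a time, and
    # MAX_QUEUE_SIZE=6 lets only six more wait, so issuing 100 requests below
    # is expected to make the server reject the overflow.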
MAX_QUEUE_SIZE="6" TRITON_MAX_BATCH_SIZE="1" BATCHING_STRATEGY="inflight_fused_batching" for BACKEND in "${BACKENDS[@]}"; do launch_triton_server run_cpp_trtllm_queue_size_tests kill_triton_server done MAX_QUEUE_SIZE="0" TRITON_MAX_BATCH_SIZE="128" BACKEND="${BACKENDS[0]}" # ------------------------------- # Python BLS # ------------------------------- ACCUMULATE_TOKENS=( "false" "true" ) E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" ) for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then continue fi launch_triton_server run_cpp_e2e_backend_tests kill_triton_server done done done E2E_MODEL_NAME="ensemble" ACCUMULATE_TOKEN="false" fi if [ "$MODEL" = "gpt-ib-streaming" ]; then DECOUPLED_MODE="True" STREAMING="true" run_all_tests="true" for BACKEND in "${BACKENDS[@]}"; do run_all_tests="true" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do for BATCH_SCHEDULER_POLICY in "${BATCH_SCHEDULER_POLICIES[@]}"; do for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do for ENABLE_CHUNKED_CONTEXT in "${ENABLE_CHUNKED_CONTEXTS[@]}"; do # Because the runners are shared, the default value of 0.9 doesn't work, so skip # if max_tokens_in_kv_cache is also empty if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then continue fi if [[ "${BATCHING_STRATEGY}" == "v1" && "${BATCH_SCHEDULER_POLICY}" == "max_utilization" ]]; then continue fi # For V1, batchScheduler currently cannot properly estimate kvCache usage if [[ "${BATCHING_STRATEGY}" == "v1" && "${MAX_TOKENS_IN_KV_CACHE}" != "" ]]; then continue fi launch_triton_server run_cpp_trtllm_backend_tests '../../tools/dataset/short_input_end_id.csv' '../../tools/dataset/short_output_end_id.csv' 268 run_cpp_e2e_backend_tests kill_triton_server run_all_tests="false" done done done done done done BACKEND="${BACKENDS[0]}" MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[0]}" # -------------------- # Python BLS test # -------------------- ACCUMULATE_TOKENS=( "false" "true" ) E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" ) run_all_tests="true" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then continue fi launch_triton_server run_cpp_e2e_backend_tests kill_triton_server done done done E2E_MODEL_NAME="ensemble" ACCUMULATE_TOKEN="false" run_all_tests="false" fi if [ "$MODEL" = "mistral-ib-streaming" ]; then DECOUPLED_MODE="True" STREAM=("true" "false") EXCLUDE_INPUT_IN_OUTPUT_OPTS=("true" "false") run_all_tests="true" MAX_BEAM_WIDTH="2" DECODING_MODES=("top_k_top_p" "beam_search") # -------------------- # Python BLS test # -------------------- ACCUMULATE_TOKENS=( "false" ) E2E_MODEL_NAMES=( "tensorrt_llm_bls" ) run_all_tests="true" for BATCHING_STRATEGY in "inflight_fused_batching"; do for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do for STREAMING in "${STREAM[@]}"; do for EXCLUDE_INPUT_IN_OUTPUT in 
"${EXCLUDE_INPUT_IN_OUTPUT_OPTS[@]}"; do for DECODING_MODE in ${DECODING_MODES[@]}; do if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then continue fi launch_triton_server test_stop_words kill_triton_server done done done done done done MAX_BEAM_WIDTH="1" E2E_MODEL_NAME="ensemble" ACCUMULATE_TOKEN="false" run_all_tests="false" DECODING_MODE="top_k_top_p" fi if [ "$MODEL" = "gpt-ib-speculative-decoding-bls" ]; then # -------------------- # Python BLS test # -------------------- DECOUPLED_MODE="False" MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" USE_DRAFT_LOGITS_VALUES=( "true" "false" ) for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do for USE_DRAFT_LOGITS in "${USE_DRAFT_LOGITS_VALUES[@]}"; do if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then continue fi draft_args="--num-draft-tokens=5" if [[ "${USE_DRAFT_LOGITS}" == "true" ]]; then # with draft logit compare the outputs are not deterministic so we just draft_args="--num-draft-tokens=5 --return-generation-logits --use-draft-logits --disable-output-comparison" fi ENABLE_KV_CACHE_REUSE="true" launch_triton_server # Test client pushd tools/inflight_batcher_llm python3 speculative_decoding_test.py \ --max-input-len 200 \ --dataset ../dataset/mini_cnn_eval_spec_decoding.json \ --url-target=localhost:8001 \ --url-draft=localhost:8001 \ --url-control=localhost:8001 \ --draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \ --target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \ --bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \ --execute-bls-speculative-decoding \ ${draft_args} \ --verbose popd # inflight_batcher_llm/client kill_triton_server done done fi if [ "$MODEL" = "gpt-ib-ptuning" ]; then #Generate reference output pushd $LLM_ROOT/examples/models/core/gpt # Input with virtual tokens: python3 $LLM_ROOT/examples/run.py \ --max_output_len=8 \ --vocab_file=c-model/email_composition/fp16/tokenizer.model \ --prompt_table_path=email_composition.npy \ --input_file=input.csv \ --engine_dir ${DECODER_ENGINE_PATH} \ --output_csv output_w_prompt.csv \ --enable_context_fmha_fp32_acc \ --no-kv_cache_enable_block_reuse #Input w/o virtual tokens: echo "25229,291,7379,251522,39854,5754,251514,315,32906,14297,398,261" > input_wo_prompt.csv python3 $LLM_ROOT/examples/run.py \ --max_output_len=8 \ --vocab_file=c-model/email_composition/fp16/tokenizer.model \ --input_file=input_wo_prompt.csv \ --engine_dir ${DECODER_ENGINE_PATH} \ --output_csv output_wo_prompt.csv \ --enable_context_fmha_fp32_acc \ --no-kv_cache_enable_block_reuse popd DECOUPLED_MODE="False" MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" ENABLE_CONTEXT_FMHA_FP32_ACC="True" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then continue fi launch_triton_server # Test client pushd inflight_batcher_llm/client python3 inflight_batcher_llm_client.py \ --prompt-embedding-table $LLM_ROOT/examples/models/core/gpt/email_composition.npy \ --prompt-task-id 0 \ --input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \ --output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output_w_prompt.csv \ --check-output \ --request-output-len 8 python3 inflight_batcher_llm_client.py --input-tokens-csv 
        python3 inflight_batcher_llm_client.py \
            --input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input_wo_prompt.csv \
            --output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output_wo_prompt.csv \
            --check-output \
            --request-output-len 8
        popd # inflight_batcher_llm/client
        kill_triton_server
    done
fi

if [ "$MODEL" = "gpt-2b-ib-lora" ]; then
    # Generate reference output
    pushd $LLM_ROOT/examples/models/core/gpt
    python3 $LLM_ROOT/examples/run.py \
        --max_output_len=8 \
        --lora_dir=gpt2b_lora-900.nemo \
        --lora_ckpt_source nemo \
        --lora_task_uids 0 \
        --engine_dir ${DECODER_ENGINE_PATH} \
        --input_file input.csv \
        --output_csv output.csv \
        --use_py_session \
        --tokenizer_dir ${TOKENIZER_PATH}
    popd

    DECOUPLED_MODE="False"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
        # LoRA is not supported in V1
        if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
            continue
        fi
        launch_triton_server

        # Test client
        pushd inflight_batcher_llm/client
        python3 inflight_batcher_llm_client.py \
            --input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \
            --output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output.csv \
            --check-output --request-output-len 8 \
            --lora-path $LLM_ROOT/examples/models/core/gpt/gpt-2b-lora-train-900 \
            --lora-task-id 12345
        python3 inflight_batcher_llm_client.py \
            --input-tokens-csv $LLM_ROOT/examples/models/core/gpt/input.csv \
            --output-tokens-csv $LLM_ROOT/examples/models/core/gpt/output.csv \
            --check-output --request-output-len 8 \
            --lora-task-id 12345

        ACCUMULATE_TOKENS=( "false" "true" )
        E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
        for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
        for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
            if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
                continue
            fi
            python3 end_to_end_grpc_client.py \
                ${STREAMING_FLAG} \
                --output-len 100 --prompt "After Washington had returned to Williamsburg, Dinwiddie ordered him to lead a larger force to assist Trent in his work. While en route, Washington learned of Trent's retreat. Since Tanaghrisson had promised support to the British, Washington continued toward Fort Duquesne and met with the Mingo leader. Learning of a French scouting party in the area, Washington, with Tanaghrisson and his party, surprised the Canadians on May 28 in what became known as the Battle of Jumonville Glen. They killed many of the Canadians, including their commanding officer, Joseph Coulon de Jumonville, whose head was reportedly split open by Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that Tanaghrisson was acting to gain the support of the British and regain authority over his own people. They had been inclined to support the French, with whom they had long trading relationships. One of Tanaghrisson's men told Contrecoeur that Jumonville had been killed by British musket fire. Question: Upon learning of a French scounting party in the area, what did Washington do? Answer:" \
                ${OVERWRITE_OUTPUT_TEXT_FLAG} \
                --lora-path $LLM_ROOT/examples/models/core/gpt/gpt-2b-lora-train-900 \
                --lora-task-id 12345 \
                --model-name "$E2E_MODEL_NAME" | tee "output_e2e_${E2E_MODEL_NAME}_${ACCUMULATE_TOKEN}"
            rt=0
            grep "Answer: He surprised the Canadians on May 28 in what became known as the Battle of Jumonville" "output_e2e_${E2E_MODEL_NAME}_${ACCUMULATE_TOKEN}" || rt=$?
            if [ ${rt} -ne 0 ]; then
                echo "FAIL"
                exit 1
            else
                echo "PASS"
            fi
        done
        done
        popd # inflight_batcher_llm/client
        #run_cpp_e2e_backend_tests
        kill_triton_server
    done
fi

if [ "$MODEL" = "gpt-speculative-decoding" ]; then
    DECOUPLED_MODE="False"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
        # Speculative decoding is not supported in V1
        if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
            continue
        fi
        TRITON_HTTP_PORT="8000"
        TRITON_GRPC_PORT="8001"
        TRITON_METRICS_PORT="8002"
        ENABLE_KV_CACHE_REUSE="true"
        launch_triton_server
        TRITON_HTTP_PORT="8003"
        TRITON_GRPC_PORT="8004"
        TRITON_METRICS_PORT="8005"
        # TODO(nkorobov): The draft model can benefit from KV cache reuse.
        # Add --enable_context_fmha --use_paged_context_fmha to its build command.
        ENABLE_KV_CACHE_REUSE="false"
        launch_triton_server

        # Test client
        pushd tools/inflight_batcher_llm
        python3 speculative_decoding_test.py \
            --max-input-len 200 \
            --dataset ../dataset/mini_cnn_eval_spec_decoding.json \
            --url-draft localhost:8004 \
            --url-target localhost:8001 \
            --url-control localhost:8001 \
            --draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
            --target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \
            --verbose
        popd # tools/inflight_batcher_llm
        kill_triton_server
    done
fi

if [ "$MODEL" = "gpt-disaggregated-serving-bls" ]; then
    DECOUPLED_MODE="False"
    STREAMING="false"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="0.2"
    export TRTLLM_USE_MPI_KVCACHE="1"
    for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
        # Disaggregated serving is not supported in v1 batching
        if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then
            continue
        fi
        launch_triton_server
        run_cpp_e2e_backend_tests
        kill_triton_server
    done
    export TRTLLM_USE_MPI_KVCACHE="0"
fi

if [ "$MODEL" = "gpt-gather-logits" ]; then
    if [ "${DRAFT_ENGINE_PATH}" == "" ]; then
        # Normal gather-logits test
        DECOUPLED_MODE="False"
        MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
        MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
        BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
        KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
        ENABLE_TRT_OVERLAP="${ENABLE_TRT_OVERLAPS[0]}"
        for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do
            launch_triton_server

            # Test client
            pushd inflight_batcher_llm/client
            # Kaiyu: nvbugs 4796041
            # python3 inflight_batcher_llm_client.py \
            #     --tokenizer-dir ${TOKENIZER_PATH} \
            #     --return-context-logits \
            #     --return-generation-logits
            python3 inflight_batcher_llm_client.py \
                --tokenizer-dir ${TOKENIZER_PATH}
            popd # inflight_batcher_llm/client

            pushd tools/inflight_batcher_llm
            # Kaiyu: nvbugs 4796041
            # python3 end_to_end_test.py \
            #     -i http \
            #     --max-input-len 192 \
            #     --return-context-logits \
            #     --return-generation-logits \
            #     --dataset ../dataset/mini_cnn_eval.json
            python3 end_to_end_test.py \
                -i http \
                --max-input-len 192 \
                --dataset ../dataset/mini_cnn_eval.json
            popd # tools/inflight_batcher_llm
            kill_triton_server
        done
    else
        # Test with speculative decoding: speculative decoding returns the
        # draft model's draft-token logits and the target model's
        # accepted-token logits.
        DECOUPLED_MODE="False"
        MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
        BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
        KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
ENABLE_TRT_OVERLAP="${ENABLE_TRT_OVERLAPS[0]}" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do # Speculative decoding is not supported in V1 if [[ "${BATCHING_STRATEGY}" == "v1" ]]; then continue fi TRITON_HTTP_PORT="8000" TRITON_GRPC_PORT="8001" TRITON_METRICS_PORT="8002" ENABLE_KV_CACHE_REUSE="true" launch_triton_server TRITON_HTTP_PORT="8003" TRITON_GRPC_PORT="8004" TRITON_METRICS_PORT="8005" # TODO(nkorobov): Draft model can benefit from enable KV cache. # Add --enable_context_fmha --use_paged_context_fmha to its build command ENABLE_KV_CACHE_REUSE="false" launch_triton_server # Test client pushd tools/inflight_batcher_llm python3 speculative_decoding_test.py \ --max-input-len 128 \ --dataset ../dataset/mini_cnn_eval_spec_decoding.json \ --url-draft localhost:8004 \ --url-target localhost:8001 \ --url-control localhost:8001 \ --draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \ --target-tensorrt-llm-model-name="${TENSORRT_LLM_TARGET_MODEL_NAME}" \ --num-draft-tokens=5 \ --return-target-model-accepted-token-logits \ --return-draft-model-draft-logits \ --verbose popd # inflight_batcher_llm/client kill_triton_server done fi fi if [ "$MODEL" = "medusa" ]; then # To make sure that torch is not a dependency for C++ backend # pip3 uninstall -y torch # Test streaming DECOUPLED_MODE="True" STREAMING="true" run_all_tests="true" MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}" MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}" DECODING_MODE="medusa" END_ID_MEDUSA=1284 MEDUSA_INPUT_IDS_PATH='../../tools/dataset/short_input_end_id_medusa.csv' MEDUSA_OUTPUT_IDS_PATH='../../tools/dataset/short_output_end_id_medusa.csv' launch_triton_server run_cpp_trtllm_backend_tests ${MEDUSA_INPUT_IDS_PATH} ${MEDUSA_OUTPUT_IDS_PATH} ${END_ID_MEDUSA} kill_triton_server # FIXME: grpc e2e test returns different result (because it is Medusa and not GPT) and has some problems with spaces # Test non-streaming DECOUPLED_MODE="False" launch_triton_server # Test client pushd inflight_batcher_llm/client python3 inflight_batcher_llm_client.py \ --request-output-len=128 \ --end-id ${END_ID_MEDUSA} \ --request-id 1 \ --tokenizer-dir ${TOKENIZER_PATH} \ --input-tokens-csv ${MEDUSA_INPUT_IDS_PATH} \ --output-tokens-csv ${MEDUSA_OUTPUT_IDS_PATH} \ --check-output popd # inflight_batcher_llm/client kill_triton_server fi if [ "$MODEL" = "eagle" ]; then # To make sure that torch is not a dependency for C++ backend # pip3 uninstall -y torch # Test streaming DECOUPLED_MODE="True" STREAMING="true" run_all_tests="true" MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}" MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}" # chunked context is not supported yet. ENABLE_CHUNKED_CONTEXT="false" DECODING_MODE="eagle" END_ID_EAGLE=1284 # Use the same I/O files as eagle is based on the same vicuna-v1.3-7b as medusa. 
if [ "$MODEL" = "eagle" ]; then
    # To make sure that torch is not a dependency for the C++ backend:
    # pip3 uninstall -y torch

    # Test streaming
    DECOUPLED_MODE="True"
    STREAMING="true"
    run_all_tests="true"
    MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
    BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}"
    # Chunked context is not supported yet.
    ENABLE_CHUNKED_CONTEXT="false"
    DECODING_MODE="eagle"
    END_ID_EAGLE=1284
    # Use the same input file as medusa, since eagle is based on the same vicuna-v1.3-7b.
    EAGLE_INPUT_IDS_PATH='../../tools/dataset/short_input_end_id_medusa.csv'
    EAGLE_OUTPUT_IDS_PATH='../../tools/dataset/short_output_end_id_eagle.csv'
    for BACKEND in "${BACKENDS[@]}"; do
        launch_triton_server
        run_cpp_trtllm_backend_tests ${EAGLE_INPUT_IDS_PATH} ${EAGLE_OUTPUT_IDS_PATH} ${END_ID_EAGLE}
        kill_triton_server
    done
    # FIXME: the grpc e2e test returns a different result (because it is Eagle
    # and not GPT) and has some problems with spaces.

    # Test non-streaming
    DECOUPLED_MODE="False"
    launch_triton_server

    # Test client
    pushd inflight_batcher_llm/client
    python3 inflight_batcher_llm_client.py \
        --request-output-len=128 \
        --end-id ${END_ID_EAGLE} \
        --request-id 1 \
        --tokenizer-dir ${TOKENIZER_PATH} \
        --input-tokens-csv ${EAGLE_INPUT_IDS_PATH} \
        --output-tokens-csv ${EAGLE_OUTPUT_IDS_PATH} \
        --check-output
    popd # inflight_batcher_llm/client
    kill_triton_server
fi

if [ "$MODEL" = "bart-ib" ] || [ "$MODEL" = "t5-ib" ]; then
    # Non-streaming tests, so decoupled mode is false.
    DECOUPLED_MODE="False"
    STREAMING="false"
    # Enc-dec models only support inflight_fused_batching, with chunked context disabled.
    CHECK_PERF_JSON_ARGS=""
    BATCHING_STRATEGY="inflight_fused_batching"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    ENABLE_CHUNKED_CONTEXT="false"
    CROSS_KV_CACHE_FRACTION="0.5"

    # -------------------------------
    # Param sweep test
    # -------------------------------
    run_all_tests="true"
    for BACKEND in "${BACKENDS[@]}"; do
    for MAX_TOKENS_IN_KV_CACHE in "${MAX_TOKENS_IN_KV_CACHES[@]}"; do
    for KV_CACHE_FREE_GPU_MEM_FRACTION in "${KV_CACHE_FREE_GPU_MEM_FRACTIONS[@]}"; do
        # Because the runners are shared, the default free-memory fraction of 0.9
        # doesn't work, so skip if max_tokens_in_kv_cache is also empty.
        if [[ "${KV_CACHE_FREE_GPU_MEM_FRACTION}" == "" && "${MAX_TOKENS_IN_KV_CACHE}" == "" ]]; then
            continue
        fi
        # Encoder-decoder models are not yet supported in the python backend.
        if [[ "${BACKEND}" == "python" ]]; then
            continue
        fi
        launch_triton_server
        run_cpp_trtllm_backend_tests
        run_cpp_e2e_backend_tests
        kill_triton_server
        run_all_tests="false"
    done
    done
    done
    BACKEND="${BACKENDS[0]}"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"

    # -------------------------------
    # Exclude input in output test
    # -------------------------------
    EXCLUDE_INPUT_IN_OUTPUT="true"
    run_all_tests="false"
    launch_triton_server
    run_cpp_trtllm_backend_tests
    run_cpp_e2e_backend_tests
    kill_triton_server
    EXCLUDE_INPUT_IN_OUTPUT="false"

    # -------------------------------
    # Max queue delay microseconds
    # -------------------------------
    run_all_tests="false"
    MAX_QUEUE_DELAY_MICROSECONDS="1000000"
    launch_triton_server
    run_cpp_trtllm_backend_tests
    run_cpp_e2e_backend_tests
    kill_triton_server
    MAX_QUEUE_DELAY_MICROSECONDS="0"

    # -------------------------------
    # Python BLS
    # -------------------------------
    ACCUMULATE_TOKENS=( "false" "true" )
    E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" )
    for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do
    for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do
        if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then
            continue
        fi
        launch_triton_server
        run_cpp_e2e_backend_tests
        kill_triton_server
    done
    done
    E2E_MODEL_NAME="ensemble"
    ACCUMULATE_TOKEN="false"

    # Reset
    CROSS_KV_CACHE_FRACTION=""
fi

if [ "$MODEL" = "blip2-opt" ]; then
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
DECOUPLED_MODE="False" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do launch_triton_server python3 tools/multimodal/client.py --model_type blip2 | tee multimodal_output grep -oi "singapore" multimodal_output kill_triton_server done # Test streaming DECOUPLED_MODE="True" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do launch_triton_server python3 tools/multimodal/client.py --model_type blip2 --streaming | tee multimodal_output grep -oi "sing" multimodal_output kill_triton_server done DECOUPLED_MODE="False" # Python BLS DECOUPLED_MODE="True" ACCUMULATE_TOKENS=( "false" "true" ) E2E_MODEL_NAMES=( "ensemble" "tensorrt_llm_bls" ) for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do for E2E_MODEL_NAME in "${E2E_MODEL_NAMES[@]}"; do for ACCUMULATE_TOKEN in "${ACCUMULATE_TOKENS[@]}"; do if [[ "${E2E_MODEL_NAME}" == "ensemble" && "${ACCUMULATE_TOKEN}" == "true" ]]; then continue fi launch_triton_server python3 tools/multimodal/client.py --model_type blip2 --use_bls --streaming | tee multimodal_output grep -oi "sing" multimodal_output kill_triton_server done done done E2E_MODEL_NAME="ensemble" ACCUMULATE_TOKEN="false" DECOUPLED_MODE="False" # Test kv cache reuse ENABLE_KV_CACHE_REUSE="True" for BATCHING_STRATEGY in "${BATCHING_STRATEGIES[@]}"; do launch_triton_server python3 tools/multimodal/client.py --text "Question: Can you identify which city is depicted in this image based on the landmarks, architecture, and overall scenery? Please provide the name of the city along with any notable features that led you to your conclusion. Answer:" --model_type blip2 --prompt_table_extra_id 1 python3 tools/multimodal/client.py --text "Question: Can you identify which city is depicted in this image based on the landmarks, architecture, and overall scenery? Please provide the name of the city along with any notable features that led you to your conclusion. 
Answer:" --model_type blip2 --prompt_table_extra_id 1 | tee multimodal_output grep -oi "singapore" multimodal_output kill_triton_server done ENABLE_KV_CACHE_REUSE="False" fi if [ "$MODEL" = "mllama" ]; then MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" CROSS_KV_CACHE_FRACTION="0.5" BATCHING_STRATEGY="inflight_fused_batching" # Test none-streaming DECOUPLED_MODE="False" for BACKEND in "${BACKENDS[@]}"; do if [[ "${BACKEND}" == "python" ]]; then continue fi launch_triton_server python3 tools/multimodal/client.py --model_type mllama | tee multimodal_output grep -oi "singapore" multimodal_output kill_triton_server done # Test streaming DECOUPLED_MODE="True" for BACKEND in "${BACKENDS[@]}"; do launch_triton_server python3 tools/multimodal/client.py --model_type mllama --streaming | tee multimodal_output grep -oi "singapore" multimodal_output kill_triton_server done DECOUPLED_MODE="False" CROSS_KV_CACHE_FRACTION="" fi if [ "$MODEL" = "whisper" ]; then MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[1]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" # enc-dec models only support inflight_fused_batching, with chunked context disabled BATCHING_STRATEGY="inflight_fused_batching" ENABLE_CHUNKED_CONTEXT="false" EXCLUDE_INPUT_IN_OUTPUT="true" CROSS_KV_CACHE_FRACTION="0.5" wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav # Test none-streaming DECOUPLED_MODE="False" pip install tiktoken soundfile launch_triton_server python3 tools/whisper/client.py --audio-path 1221-135766-0002.wav kill_triton_server # Test streaming DECOUPLED_MODE="True" launch_triton_server python3 tools/whisper/client.py --audio-path 1221-135766-0002.wav --streaming kill_triton_server EXCLUDE_INPUT_IN_OUTPUT="false" DECOUPLED_MODE="False" CROSS_KV_CACHE_FRACTION="" fi if [ "$MODEL" = "llava_onevision" ]; then MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}" BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}" KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}" # Test none-streaming DECOUPLED_MODE="False" BATCHING_STRATEGY="inflight_fused_batching" launch_triton_server python3 tools/multimodal/client.py --model_type llava_onevision --end-id 151645 --pad-id 151643 | tee multimodal_output grep -oi "singapore" multimodal_output kill_triton_server # Test streaming DECOUPLED_MODE="True" BATCHING_STRATEGY="inflight_fused_batching" launch_triton_server python3 tools/multimodal/client.py --model_type llava_onevision --streaming --end-id 151645 --pad-id 151643 | tee multimodal_output grep -oi "sing" multimodal_output kill_triton_server # Test with video input DECOUPLED_MODE="False" BATCHING_STRATEGY="inflight_fused_batching" launch_triton_server VIDEO_PATH=$TOKENIZER_PATH'/../video-neva/test_video/video_test.mp4' python3 tools/multimodal/client.py --model_type llava_onevision --end-id 151645 --pad-id 151643 --text "What is in this video?" 
    python3 tools/multimodal/client.py --model_type llava_onevision --end-id 151645 --pad-id 151643 --text "What is in this video?" --video $VIDEO_PATH --video_num_frames 8 | tee multimodal_output
    grep -oi "robotic hand" multimodal_output
    kill_triton_server
fi

if [ "$MODEL" = "qwen2_vl" ]; then
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"

    # Test non-streaming
    DECOUPLED_MODE="False"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type qwen2_vl --end-id 151645 --pad-id 151643 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server

    # Test streaming
    DECOUPLED_MODE="True"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type qwen2_vl --streaming --end-id 151645 --pad-id 151643 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server
fi

if [ "$MODEL" = "llava" ]; then
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[1]}"
    ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[1]}"
    echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"

    # Test non-streaming
    DECOUPLED_MODE="False"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type llava --end-id 2 --pad-id 32001 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server

    # Test streaming
    DECOUPLED_MODE="True"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type llava --streaming --end-id 2 --pad-id 32001 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server
fi

if [ "$MODEL" = "llava_fp8" ]; then
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[1]}"
    ENABLE_CHUNKED_CONTEXT="${ENABLE_CHUNKED_CONTEXTS[1]}"
    echo "ENABLE_CHUNKED_CONTEXT: ${ENABLE_CHUNKED_CONTEXT}"

    # Test non-streaming
    DECOUPLED_MODE="False"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type llava --end-id 2 --pad-id 32001 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server

    # Test streaming
    DECOUPLED_MODE="True"
    BATCHING_STRATEGY="inflight_fused_batching"
    launch_triton_server
    python3 tools/multimodal/client.py --model_type llava --streaming --end-id 2 --pad-id 32001 | tee multimodal_output
    grep -oi "Singapore" multimodal_output
    kill_triton_server
fi

if [ "$MODEL" = "gpt-ib-lad" ]; then
    # Test streaming
    DECOUPLED_MODE="False"
    STREAMING="true"
    run_all_tests="true"
    MAX_NUM_SEQUENCE="${MAX_NUM_SEQUENCES[0]}"
    MAX_TOKENS_IN_KV_CACHE="${MAX_TOKENS_IN_KV_CACHES[0]}"
    BATCH_SCHEDULER_POLICY="${BATCH_SCHEDULER_POLICIES[0]}"
    KV_CACHE_FREE_GPU_MEM_FRACTION="${KV_CACHE_FREE_GPU_MEM_FRACTIONS[0]}"
    BATCHING_STRATEGY="${BATCHING_STRATEGIES[0]}"
    DECODING_MODE="lookahead"
    # Lookahead parameters
    LOOKAHEAD_WINDOW_SIZE=7
    LOOKAHEAD_NGRAM_SIZE=7
    LOOKAHEAD_VERIFICATION_SET_SIZE=7
    LOOKAHEAD_CONFIG="--lookahead_config=[${LOOKAHEAD_WINDOW_SIZE},${LOOKAHEAD_NGRAM_SIZE},${LOOKAHEAD_VERIFICATION_SET_SIZE}]"
    launch_triton_server

    # Test client
    pushd inflight_batcher_llm/client
    python3 inflight_batcher_llm_client.py \
        ${LOOKAHEAD_CONFIG} \
        --tokenizer-dir ${TOKENIZER_PATH}
    popd # inflight_batcher_llm/client
    kill_triton_server
fi

popd # $LLM_BACKEND_ROOT