import os
import re
import sys
import time  # used by the stop_triton_server fixture below

import pytest
import torch
import yaml

from .build_engines import *
from .common import *
from .conftest import find_repo_root, venv_check_call, venv_check_output
from .trt_test_alternative import call, check_call, print_info

LLM_ROOT = os.environ.get("LLM_ROOT", find_repo_root())
# Make the triton_backend sources importable for the tests below.
sys.path.append(os.path.join(LLM_ROOT, "triton_backend"))

@pytest.fixture(autouse=True)
def stop_triton_server():
    # Make sure no Triton server is left running before each test.
    call("pkill -9 -f tritonserver", shell=True)
    call("pkill -9 -f trtllmExecutorWorker", shell=True)
    time.sleep(2)
    yield
    # Gracefully terminate the Triton server after each test.
    call("pkill -f tritonserver", shell=True)
    call("pkill -f trtllmExecutorWorker", shell=True)
    time.sleep(8)
|
|
|
|
|
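# The IFB (in-flight batching) tests below share one flow: build a TRT-LLM
# engine, copy the inflight-batcher model repo, fill in config.pbtxt via
# modify_ib_config_pbtxt, launch Triton through launch_triton_server.py, and
# then run the client-side feature checks against the deployed models.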
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True", "False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False", "True"],
|
|
ids=["disableTrtOverlap", "enableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("FEATURE_NAME", [
|
|
"test_basic", "batched_inputs", "test_log_probs", "test_request_id",
|
|
"test_stop_words", "test_embedding_bias", "test_n_returns"
|
|
])
|
|
def test_llama_v2_7b_ifb(
|
|
E2E_MODEL_NAME,
|
|
FEATURE_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if BATCHING_STRATEGY == "V1" and FEATURE_NAME == "test_embedding_bias":
|
|
pytest.skip("Skipping. V1 doesn't support embedding_bias tensor yet.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH = prepare_llama_v2_7b_engine("ifb",
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root)
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llama_v2_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME="",
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
    feature_name = FEATURE_NAME
    tokenizer_dir = llama_v2_tokenizer_model_root
|
|
|
|
if DECOUPLED_MODE == "False":
|
|
run_cpp_backend_tests(feature_name, llm_backend_venv,
|
|
inflight_batcher_llm_client_root, tokenizer_dir)
|
|
else:
|
|
test_model_name = ""
|
|
if ACCUMULATE_TOKEN == "True" and E2E_MODEL_NAME == "tensorrt_llm_bls":
|
|
test_model_name = "llama_v2_7b"
|
|
|
|
run_cpp_streaming_backend_tests(feature_name,
|
|
llm_backend_venv,
|
|
inflight_batcher_llm_client_root,
|
|
tokenizer_dir,
|
|
model_name=test_model_name,
|
|
e2e_model=E2E_MODEL_NAME)
|
|
|
|
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", ["4096"])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_mistral_v1_7b_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
mistral_v1_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_mistral_v1_7b_engine("ifb",
|
|
tensorrt_llm_llama_example_root,
|
|
mistral_v1_tokenizer_model_root)
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = mistral_v1_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --force --world_size 1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
    run_cmd = [
        f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
        f"--tokenizer-dir={mistral_v1_tokenizer_model_root}",
        "--tokenizer-type=llama",
    ]
    if DECOUPLED_MODE == "True":
        run_cmd += [
            "--streaming",
        ]

    venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", ["4096"])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_mistral_v1_multi_models(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
mistral_v1_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_mistral_v1_7b_engine("ifb",
|
|
tensorrt_llm_llama_example_root,
|
|
mistral_v1_tokenizer_model_root)
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = mistral_v1_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call((f"python3 {launch_server_py} --force --world_size 1 "
|
|
f"--model_repo={new_model_repo} --multi-model"),
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={mistral_v1_tokenizer_model_root}",
|
|
"--tokenizer-type=llama",
|
|
"--model-name=tensorrt_llm",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
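# Python-backend variant: this test reuses the all_models/gpt repo templates
# and drives the server with the gpt example's end_to_end_test.py / client.py
# instead of the C++ inflight batcher client.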
@pytest.mark.parametrize("TEST_TYPE", ["e2e", "accuracy"])
|
|
def test_mistral_v1_7b_python_backend(
|
|
TEST_TYPE,
|
|
llm_backend_gpt_example_root,
|
|
mistral_v1_tokenizer_model_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llm_backend_venv,
|
|
):
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_mistral_v1_7b_engine("python_backend",
|
|
tensorrt_llm_llama_example_root,
|
|
mistral_v1_tokenizer_model_root)
|
|
# Prepare model repo
|
|
origin_model_repo = os.path.join(llm_backend_repo_root, "all_models", "gpt")
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
check_call(f"rm -rf {new_model_repo}", shell=True)
|
|
check_call(f"cp -R {origin_model_repo} {new_model_repo}", shell=True)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = mistral_v1_tokenizer_model_root
|
|
fill_template_py = os.path.join(llm_backend_repo_root, "tools",
|
|
"fill_template.py")
|
|
llm_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"tensorrt_llm", "config.pbtxt")
|
|
preprocessing_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"preprocessing", "config.pbtxt")
|
|
postprocessing_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"postprocessing", "config.pbtxt")
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {llm_config} engine_dir:{ENGINE_PATH}",
|
|
shell=True)
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {preprocessing_config} tokenizer_dir:{TOKENIZER_PATH}",
|
|
shell=True)
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {postprocessing_config} tokenizer_dir:{TOKENIZER_PATH}",
|
|
shell=True)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
if TEST_TYPE == "e2e":
|
|
run_cmd = [
|
|
f"{llm_backend_gpt_example_root}/end_to_end_test.py",
|
|
f"--tokenizer_dir={TOKENIZER_PATH}",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
elif TEST_TYPE == "accuracy":
|
|
run_cmd = [
|
|
f"{llm_backend_gpt_example_root}/client.py",
|
|
"--text=Born in north-east France, Soyer trained as a",
|
|
"--output_len=10",
|
|
f"--tokenizer_dir={TOKENIZER_PATH}",
|
|
]
|
|
|
|
output = venv_check_output(llm_backend_venv,
|
|
run_cmd).strip().split("\n")[-1]
|
|
|
|
print_info(output)
|
|
|
|
|
|
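# The Llama-v2 70B engine is served with --world_size=8, so this test is
# skipped on machines with fewer than 8 GPUs.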
@pytest.mark.skip_less_device(8)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_llama_v2_70b_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_llama_v2_70b_engine("ifb",
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llama_v2_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=8 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={llama_v2_tokenizer_model_root}",
|
|
"--tokenizer-type=llama",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
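# Lookahead-decoding variant: the executor lookahead (window, ngram,
# verification-set) sizes are written into config.pbtxt and also passed to
# the client via --lookahead_config.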
@pytest.mark.skip_less_device(8)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["lookahead"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("EXECUTOR_LOOKAHEAD_WINDOW", ["7"])
|
|
@pytest.mark.parametrize("EXECUTOR_LOOKAHEAD_NGRAM", ["7"])
|
|
@pytest.mark.parametrize("EXECUTOR_LOOKAHEAD_VERIFICATION_SET", ["7"])
|
|
def test_llama_v2_70b_ifb_lad(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
EXECUTOR_LOOKAHEAD_WINDOW,
|
|
EXECUTOR_LOOKAHEAD_NGRAM,
|
|
EXECUTOR_LOOKAHEAD_VERIFICATION_SET,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_llama_v2_70b_engine("ifb",
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v2_tokenizer_model_root,
|
|
use_lad=True)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llama_v2_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXECUTOR_LOOKAHEAD_WINDOW=EXECUTOR_LOOKAHEAD_WINDOW,
|
|
EXECUTOR_LOOKAHEAD_NGRAM=EXECUTOR_LOOKAHEAD_NGRAM,
|
|
EXECUTOR_LOOKAHEAD_VERIFICATION_SET=EXECUTOR_LOOKAHEAD_VERIFICATION_SET,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=8 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={llama_v2_tokenizer_model_root}",
|
|
"--tokenizer-type=llama",
|
|
f"--lookahead_config=[{EXECUTOR_LOOKAHEAD_WINDOW}, {EXECUTOR_LOOKAHEAD_NGRAM}, {EXECUTOR_LOOKAHEAD_VERIFICATION_SET}]"
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
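# Medusa speculative decoding: the output may legitimately differ from the
# base model, so the client check runs with a relaxed correctness threshold.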
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["medusa"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_medusa_vicuna_7b_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_medusa_example_root,
|
|
vicuna_7b_model_root,
|
|
medusa_vicuna_7b_model_root,
|
|
llama_v2_tokenizer_model_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_medusa_vicuna_7b_engine(
|
|
tensorrt_llm_medusa_example_root, vicuna_7b_model_root,
|
|
medusa_vicuna_7b_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llama_v2_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
    # Allow the Medusa output to differ somewhat from the base model output.
    # This is a known issue: enabling Medusa may select different kernels.
    correctness_threshold = 0.7
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
"--request-output-len=128", "--end-id=1284", "--request-id=1",
|
|
f"--tokenizer-dir={llama_v2_tokenizer_model_root}",
|
|
f"--input-tokens-csv={llm_backend_dataset_root}/short_input_end_id_medusa.csv",
|
|
f"--output-tokens-csv={llm_backend_dataset_root}/short_output_end_id_medusa.csv",
|
|
"--check-output", f"--correctness-threshold={correctness_threshold}"
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
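# EAGLE speculative decoding: same setup as the Medusa test above; it reuses
# the Medusa input/output token CSVs because both builds start from the same
# vicuna-7b-1.3 base model.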
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["eagle"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_eagle_vicuna_7b_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_eagle_example_root,
|
|
vicuna_7b_model_root,
|
|
eagle_vicuna_7b_model_root,
|
|
llama_v2_tokenizer_model_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_eagle_vicuna_7b_engine(
|
|
tensorrt_llm_eagle_example_root, vicuna_7b_model_root,
|
|
eagle_vicuna_7b_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llama_v2_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
    # Allow the EAGLE output to differ somewhat from the base model output.
    # This is a known issue: enabling EAGLE may select different kernels.
    correctness_threshold = 0.7
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
"--request-output-len=128",
|
|
"--end-id=1284",
|
|
"--request-id=1",
|
|
f"--tokenizer-dir={llama_v2_tokenizer_model_root}",
|
|
# We use the same i/o as medusa here as eagle is based on the same vicuna-1.3-7b model
|
|
f"--input-tokens-csv={llm_backend_dataset_root}/short_input_end_id_medusa.csv",
|
|
f"--output-tokens-csv={llm_backend_dataset_root}/short_output_end_id_medusa.csv",
|
|
"--check-output",
|
|
f"--correctness-threshold={correctness_threshold}"
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
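# GPT-350M served through the Python backend (all_models/gpt repo), covered
# by an end-to-end run and a single-prompt accuracy query plus a metrics check.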
@pytest.mark.parametrize("TEST_TYPE", ["e2e", "accuracy"])
|
|
def test_gpt_350m_python_backend(
|
|
TEST_TYPE,
|
|
llm_backend_gpt_example_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH = prepare_gpt_350m_engine(
|
|
"python_backend",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
|
|
# Prepare model repo
|
|
origin_model_repo = os.path.join(llm_backend_repo_root, "all_models", "gpt")
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
check_call(f"rm -rf {new_model_repo}", shell=True)
|
|
check_call(f"cp -R {origin_model_repo} {new_model_repo}", shell=True)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
fill_template_py = os.path.join(llm_backend_repo_root, "tools",
|
|
"fill_template.py")
|
|
llm_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"tensorrt_llm", "config.pbtxt")
|
|
preprocessing_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"preprocessing", "config.pbtxt")
|
|
postprocessing_config = os.path.join(llm_backend_repo_root, "triton_repo",
|
|
"postprocessing", "config.pbtxt")
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {llm_config} engine_dir:{ENGINE_PATH}",
|
|
shell=True)
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {preprocessing_config} tokenizer_dir:{TOKENIZER_PATH}",
|
|
shell=True)
|
|
check_call(
|
|
f"python3 {fill_template_py} -i {postprocessing_config} tokenizer_dir:{TOKENIZER_PATH}",
|
|
shell=True)
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
if TEST_TYPE == "e2e":
|
|
run_cmd = [
|
|
f"{llm_backend_gpt_example_root}/end_to_end_test.py",
|
|
f"--tokenizer_dir={TOKENIZER_PATH}",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
elif TEST_TYPE == "accuracy":
|
|
run_cmd = [
|
|
f"{llm_backend_gpt_example_root}/client.py",
|
|
"--text=Born in north-east France, Soyer trained as a",
|
|
"--output_len=10",
|
|
f"--tokenizer_dir={TOKENIZER_PATH}",
|
|
]
|
|
|
|
output = venv_check_output(llm_backend_venv,
|
|
run_cmd).strip().split("\n")[-1]
|
|
|
|
print_info(output)
|
|
check_server_metrics()
|
|
|
|
    # TODO: Validate the accuracy of the returned output.
|
|
|
|
|
|
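# GPT-350M IFB feature tests: same feature matrix as the Llama-v2 7B test
# above, with an additional server-metrics check for the "test_basic" case.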
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True", "False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["", "top_k_top_p"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("FEATURE_NAME", [
|
|
"test_basic", "batched_inputs", "test_log_probs", "test_request_id",
|
|
"test_stop_words", "test_embedding_bias"
|
|
])
|
|
def test_gpt_350m_ifb(
|
|
E2E_MODEL_NAME,
|
|
FEATURE_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if BATCHING_STRATEGY == "V1" and FEATURE_NAME == "test_embedding_bias":
|
|
pytest.skip("Skipping. V1 doesn't support embedding_bias tensor yet.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH = prepare_gpt_350m_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME="",
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
    feature_name = FEATURE_NAME
    tokenizer_dir = gpt_tokenizer_model_root
|
|
|
|
if DECOUPLED_MODE == "False":
|
|
run_cpp_backend_tests(feature_name, llm_backend_venv,
|
|
inflight_batcher_llm_client_root, tokenizer_dir)
|
|
else:
|
|
test_model_name = ""
|
|
if ACCUMULATE_TOKEN == "True" and E2E_MODEL_NAME == "tensorrt_llm_bls":
|
|
test_model_name = "gpt_350m"
|
|
|
|
run_cpp_streaming_backend_tests(feature_name,
|
|
llm_backend_venv,
|
|
inflight_batcher_llm_client_root,
|
|
tokenizer_dir,
|
|
model_name=test_model_name,
|
|
e2e_model=E2E_MODEL_NAME)
|
|
|
|
if feature_name == "test_basic":
|
|
check_server_metrics(batching_strategy=BATCHING_STRATEGY,
|
|
kv_cache_reuse=ENABLE_KV_CACHE_REUSE)
|
|
|
|
|
|
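# Encoder-decoder (T5-small) test: prepare_t5_small_engine returns both an
# encoder and a decoder engine, and config.pbtxt additionally needs
# ENCODER_ENGINE_PATH and CROSS_KV_CACHE_FRACTION filled in.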
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True", "False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", ["4096"])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("CROSS_KV_CACHE_FRACTION", [""])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["", "top_k_top_p"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["True", "False"])
|
|
@pytest.mark.parametrize("FEATURE_NAME", ["test_basic"])
|
|
def test_t5_small_enc_dec_ifb(
|
|
E2E_MODEL_NAME,
|
|
FEATURE_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
CROSS_KV_CACHE_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_enc_dec_example_root,
|
|
t5_small_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if BATCHING_STRATEGY == "V1" and FEATURE_NAME == "test_embedding_bias":
|
|
pytest.skip("Skipping. V1 doesn't support embedding_bias tensor yet.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENCODER_ENGINE_DIR, ENGINE_DIR = prepare_t5_small_engine(
|
|
tensorrt_llm_enc_dec_example_root, t5_small_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = t5_small_model_root
|
|
if CROSS_KV_CACHE_FRACTION == "":
|
|
CROSS_KV_CACHE_FRACTION = "0.5"
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
ENCODER_ENGINE_PATH=ENCODER_ENGINE_DIR,
|
|
CROSS_KV_CACHE_FRACTION=CROSS_KV_CACHE_FRACTION,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
    feature_name = FEATURE_NAME
|
|
|
|
if DECOUPLED_MODE == "False":
|
|
run_cpp_backend_tests(feature_name, llm_backend_venv,
|
|
inflight_batcher_llm_client_root, TOKENIZER_PATH)
|
|
else:
|
|
run_cpp_streaming_backend_tests(feature_name, llm_backend_venv,
|
|
inflight_batcher_llm_client_root,
|
|
TOKENIZER_PATH)
|
|
|
|
|
|
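# Whisper test: in addition to the encoder/decoder engines, the model repo is
# trimmed down to the whisper_bls flow and the tokenizer / mel-filter assets
# are copied next to it before launching Triton.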
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True", "False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", ["24000"])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("CROSS_KV_CACHE_FRACTION", ["0.5"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["top_k_top_p"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["True"])
|
|
def test_whisper_large_v3_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
CROSS_KV_CACHE_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
llm_backend_whisper_example_root,
|
|
tensorrt_llm_whisper_example_root,
|
|
whisper_large_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
        pytest.skip("Skipping. V1 doesn't support max_utilization.")

    if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
        pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENCODER_ENGINE_DIR, ENGINE_DIR = prepare_whisper_large_engine(
|
|
tensorrt_llm_whisper_example_root, whisper_large_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root,
|
|
new_model_repo,
|
|
model_name="whisper")
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = whisper_large_model_root
|
|
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
ENCODER_ENGINE_PATH=ENCODER_ENGINE_DIR,
|
|
CROSS_KV_CACHE_FRACTION=CROSS_KV_CACHE_FRACTION,
|
|
)
|
|
|
|
#####Whisper Specific#####
|
|
    # Remove model directories that the Whisper pipeline does not use.
|
|
check_call(f"rm -rf {new_model_repo}/preprocessing", shell=True)
|
|
check_call(f"rm -rf {new_model_repo}/postprocessing", shell=True)
|
|
check_call(f"rm -rf {new_model_repo}/ensemble", shell=True)
|
|
check_call(f"rm -rf {new_model_repo}/tensorrt_llm_bls", shell=True)
|
|
|
|
# Copy tiktoken and npz to triton repo
|
|
check_call(
|
|
f"cp -vf {whisper_large_model_root}/multilingual.tiktoken {new_model_repo}/whisper_bls/1",
|
|
shell=True)
|
|
check_call(
|
|
f"cp -vf {whisper_large_model_root}/mel_filters.npz {new_model_repo}/whisper_bls/1",
|
|
shell=True)
|
|
|
|
    # Install third-party libs required by the Whisper client.
    check_call("pip3 install tiktoken soundfile", shell=True)
|
|
##########################
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{llm_backend_whisper_example_root}/client.py",
|
|
f"--audio-path={whisper_large_model_root}/1221-135766-0002.wav",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
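# Gather-logits test: the engine is built via prepare_gpt_gather_logits_engine
# and the client is asked to return context and generation logits (or an
# end-to-end dataset run, depending on TEST_TYPE).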
@pytest.mark.parametrize("TEST_TYPE", ["e2e", "client"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True", "False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_gpt_gather_logits_ifb(
|
|
TEST_TYPE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH = prepare_gpt_gather_logits_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
    print_info(
        f"Launching Triton Server with command: python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}"
    )
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
if TEST_TYPE == "client":
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={gpt_tokenizer_model_root}",
|
|
"--return-context-logits", "--return-generation-logits"
|
|
]
|
|
elif TEST_TYPE == "e2e":
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/end_to_end_test.py",
|
|
"-i=http",
|
|
"--max-input-len=192",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval.json",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["True"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_gpt_350m_speculative_decoding(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. Speculative decoding is not supported in V1.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
CONTROL_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_control_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
TARGET_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_target_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
DRAFT_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
# Modify config.pbtxt
|
|
ENABLE_KV_CACHE_REUSE = "True"
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
)
|
|
|
|
# Launch First server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready(http_port="8000")
|
|
|
|
## second suit
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
ENABLE_KV_CACHE_REUSE = "False"
|
|
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
)
|
|
|
|
## Launch second server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo} " \
|
|
f"--grpc_port=8004 --http_port=8003 --metrics_port=8005",
|
|
shell=True)
|
|
check_server_ready(http_port="8003")
|
|
|
|
# Run Test
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME = "tensorrt_llm_draft"
|
|
TENSORRT_LLM_TARGET_MODEL_NAME = "tensorrt_llm_target"
|
|
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/speculative_decoding_test.py",
|
|
"--max-input-len=200",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval_spec_decoding.json",
|
|
"--url-draft=0.0.0.0:8004",
|
|
"--url-target=0.0.0.0:8001",
|
|
"--url-control=0.0.0.0:8001",
|
|
f"--draft-tensorrt-llm-model-name={TENSORRT_LLM_DRAFT_MODEL_NAME}",
|
|
f"--target-tensorrt-llm-model-name={TENSORRT_LLM_TARGET_MODEL_NAME}",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.skip_less_device_memory(80000)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["True"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_gpt_350m_speculative_decoding_return_logits(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. Speculative decoding is not supported in V1.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
CONTROL_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_control_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
TARGET_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_target_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
DRAFT_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
# Modify config.pbtxt
|
|
ENABLE_KV_CACHE_REUSE = "True"
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
)
|
|
|
|
    # Launch the first server on the default ports
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready(http_port="8000")
|
|
|
|
    ## Second suite: rebuild the model repo with KV cache reuse disabled
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
ENABLE_KV_CACHE_REUSE = "False"
|
|
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
)
|
|
|
|
    ## Launch the second server on separate ports so both instances can run concurrently
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo} " \
|
|
f"--grpc_port=8004 --http_port=8003 --metrics_port=8005",
|
|
shell=True)
|
|
check_server_ready(http_port="8003")
|
|
# Run Test
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME = "tensorrt_llm_draft"
|
|
TENSORRT_LLM_TARGET_MODEL_NAME = "tensorrt_llm_target"
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/speculative_decoding_test.py",
|
|
"--max-input-len=128",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval_spec_decoding.json",
|
|
"--url-draft=0.0.0.0:8004",
|
|
"--url-target=0.0.0.0:8001",
|
|
"--url-control=0.0.0.0:8001",
|
|
"--num-draft-tokens=5",
|
|
"--return-target-model-accepted-token-logits",
|
|
"--return-draft-model-draft-logits",
|
|
"--verbose",
|
|
f"--draft-tensorrt-llm-model-name={TENSORRT_LLM_DRAFT_MODEL_NAME}",
|
|
f"--target-tensorrt-llm-model-name={TENSORRT_LLM_TARGET_MODEL_NAME}",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["guaranteed_no_evict", "max_utilization"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["True"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("USE_DRAFT_LOGITS_VALUES", ["True", "False"])
|
|
def test_gpt_speculative_decoding_bls(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
USE_DRAFT_LOGITS_VALUES,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. Speculative decoding is not supported in V1.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
CONTROL_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_control_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
TARGET_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"medium_target_ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt2_medium_tokenizer_model_root,
|
|
)
|
|
DRAFT_ENGINE_DIR = prepare_gpt_350m_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
# Modify config.pbtxt
|
|
ENABLE_KV_CACHE_REUSE = "True"
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
)
|
|
|
|
# Launch Triton server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready(http_port="8000")
|
|
|
|
# Run Test
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME = "tensorrt_llm_draft"
|
|
TENSORRT_LLM_TARGET_MODEL_NAME = "tensorrt_llm_target"
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/speculative_decoding_test.py",
|
|
"--max-input-len=200",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval_spec_decoding.json",
|
|
"--url-target=0.0.0.0:8001",
|
|
"--url-draft=0.0.0.0:8001",
|
|
"--url-control=0.0.0.0:8001",
|
|
f"--draft-tensorrt-llm-model-name={TENSORRT_LLM_DRAFT_MODEL_NAME}",
|
|
f"--target-tensorrt-llm-model-name={TENSORRT_LLM_TARGET_MODEL_NAME}",
|
|
"--bls-speculative-tensorrt-llm-model-name=tensorrt_llm_bls",
|
|
"--execute-bls-speculative-decoding",
|
|
"--num-draft-tokens=5",
|
|
"--verbose",
|
|
]
|
|
|
|
if USE_DRAFT_LOGITS_VALUES == "True":
|
|
run_cmd += [
|
|
"--return-generation-logits",
|
|
"--use-draft-logits",
|
|
"--disable-output-comparison",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.skip_less_device(8)
|
|
@pytest.mark.skip_less_device_memory(80000)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["True"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("USE_DRAFT_LOGITS_VALUES", ["True", "False"])
|
|
@pytest.mark.parametrize("DATA_TYPE", ["fp8", "bfloat16"])
|
|
def test_llama_v3_speculative_decoding_bls(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
USE_DRAFT_LOGITS_VALUES,
|
|
DATA_TYPE,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v3_8b_model_root,
|
|
llama_v3_70b_model_root,
|
|
tensorrt_llm_example_root,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
if DATA_TYPE == "fp8" and getSMVersion() < 89:
|
|
pytest.skip("Skipping fp8 test on pre-Ada architecture")
|
|
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. Speculative decoding is not supported in V1.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
DRAFT_ENGINE_DIR = prepare_llama_v3_8b_engine(
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v3_8b_model_root,
|
|
data_type=DATA_TYPE)
|
|
CONTROL_ENGINE_DIR = prepare_llama_v3_70b_engine(
|
|
"control_ifb",
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v3_70b_model_root,
|
|
data_type=DATA_TYPE)
|
|
TARGET_ENGINE_DIR = prepare_llama_v3_70b_engine(
|
|
"target_ifb",
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llama_v3_70b_model_root,
|
|
data_type=DATA_TYPE)
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_draft")
|
|
prepare_custom_config(llm_backend_repo_root, new_model_repo,
|
|
"tensorrt_llm_target")
|
|
|
|
# Modify config.pbtxt
|
|
ENABLE_KV_CACHE_REUSE = "True"
|
|
PARTICIPANT_IDS_DRAFT = "1\\,2\\,3\\,4\\,5\\,6\\,7\\,8"
|
|
PARTICIPANT_IDS_TARGET = "9\\,10\\,11\\,12\\,13\\,14\\,15\\,16"
|
|
PARTICIPANT_IDS = "17\\,18\\,19\\,20\\,21\\,22\\,23\\,24"
|
|
SPEC_DEC_FAST_LOGITS = "1"
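    # Ranks 1-8, 9-16 and 17-24 serve the draft, target and control engines
    # respectively, leaving rank 0 for the orchestrator, which matches the
    # --world_size=25 launch below; SPEC_DEC_FAST_LOGITS=1 enables the
    # speculative-decoding fast-logits path between draft and target models.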
|
|
TOKENIZER_PATH = llama_v3_8b_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
CONTROL_ENGINE_DIR,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
DRAFT_ENGINE_PATH=DRAFT_ENGINE_DIR,
|
|
TARGET_ENGINE_PATH=TARGET_ENGINE_DIR,
|
|
PARTICIPANT_IDS_DRAFT=PARTICIPANT_IDS_DRAFT,
|
|
PARTICIPANT_IDS_TARGET=PARTICIPANT_IDS_TARGET,
|
|
PARTICIPANT_IDS=PARTICIPANT_IDS,
|
|
SPEC_DEC_FAST_LOGITS=SPEC_DEC_FAST_LOGITS,
|
|
)
|
|
|
|
# Launch Triton server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
model_names = "tensorrt_llm,tensorrt_llm_draft,tensorrt_llm_target"
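    # All three TRT-LLM model instances (control, draft and target) are hosted
    # by the single multi-model Triton launch below.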
|
|
check_call(
|
|
f"python3 {launch_server_py} --model_repo={new_model_repo} --tensorrt_llm_model_name {model_names} --multi-model --disable-spawn-processes --world_size=25",
|
|
shell=True)
|
|
check_server_ready(http_port="8000")
|
|
|
|
# Run Test
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME = "tensorrt_llm_draft"
|
|
TENSORRT_LLM_TARGET_MODEL_NAME = "tensorrt_llm_target"
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/speculative_decoding_test.py",
|
|
"--max-input-len=200",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval_spec_decoding.json",
|
|
"--url-target=0.0.0.0:8001",
|
|
"--url-draft=0.0.0.0:8001",
|
|
"--url-control=0.0.0.0:8001",
|
|
f"--draft-tensorrt-llm-model-name={TENSORRT_LLM_DRAFT_MODEL_NAME}",
|
|
f"--target-tensorrt-llm-model-name={TENSORRT_LLM_TARGET_MODEL_NAME}",
|
|
"--bls-speculative-tensorrt-llm-model-name=tensorrt_llm_bls",
|
|
"--execute-bls-speculative-decoding",
|
|
"--num-draft-tokens=5",
|
|
"--disable-output-comparison",
|
|
"--verbose",
|
|
]
|
|
|
|
if USE_DRAFT_LOGITS_VALUES == "True":
|
|
run_cmd += [
|
|
"--return-generation-logits",
|
|
"--use-draft-logits",
|
|
"--disable-output-comparison",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.skip_less_device(8)
|
|
@pytest.mark.skip_less_device_memory(80000)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_gpt_175b_dummyWeights_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
tensorrt_llm_example_root,
|
|
gpt_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = prepare_gpt_175b_engine("ifb", tensorrt_llm_gpt_example_root,
|
|
tensorrt_llm_example_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=8 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={gpt_tokenizer_model_root}",
|
|
"--tokenizer-type=auto",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
def test_llava(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
tensorrt_llm_multimodal_example_root,
|
|
tensorrt_llm_llama_example_root,
|
|
llava_model_root,
|
|
llm_backend_multimodal_example_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_llava_engine(
|
|
tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
|
|
llava_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Prepare multimodal specific repo
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"ensemble")
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"multimodal_encoders")
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llava_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
|
|
    # NOTE
    # Due to an MPI init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
|
|
check_call(
|
|
f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo} --exit_timeout=300",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{llm_backend_multimodal_example_root}/client.py",
|
|
"--model_type=llava",
|
|
f"--hf_model_dir={llava_model_root}",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
if E2E_MODEL_NAME == "tensorrt_llm_bls":
|
|
run_cmd += [
|
|
"--use_bls",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("FEATURE_NAME", ["test_basic", "test_video"])
|
|
def test_llava_onevision(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
FEATURE_NAME,
|
|
tensorrt_llm_multimodal_example_root,
|
|
tensorrt_llm_qwen_example_root,
|
|
llava_onevision_model_root,
|
|
llm_backend_all_models_root,
|
|
llm_backend_multimodal_example_root,
|
|
test_video_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_llava_onevision_engine(
|
|
tensorrt_llm_multimodal_example_root, tensorrt_llm_qwen_example_root,
|
|
llava_onevision_model_root, llm_backend_all_models_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Prepare multimodal specific repo
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"ensemble")
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"multimodal_encoders")
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = llava_onevision_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
|
|
    # NOTE
    # Due to an MPI init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
|
|
check_call(
|
|
f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
if FEATURE_NAME == "test_basic":
|
|
keyword = "singapore"
|
|
run_cmd = [
|
|
f"{llm_backend_multimodal_example_root}/client.py",
|
|
"--model_type=llava_onevision",
|
|
"--end-id=151645",
|
|
"--pad-id=151643",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
keyword = "sing"
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
elif FEATURE_NAME == "test_video":
|
|
keyword = "robotic"
|
|
run_cmd = [
|
|
f"{llm_backend_multimodal_example_root}/client.py",
|
|
"--model_type=llava_onevision",
|
|
"--end-id=151645",
|
|
"--pad-id=151643",
|
|
"--text=What is in this video?",
|
|
f"--video={test_video_root}/video_test.mp4",
|
|
"--video_num_frames=8",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
output_result = venv_check_output(llm_backend_venv, run_cmd)
|
|
validate_by_keyword(output_result, keyword)
@pytest.mark.skip_less_device_memory(80000)
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
|
|
@pytest.mark.parametrize("CROSS_KV_CACHE_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("IMAGE_TYPE", ["URL", "BASE64"])
|
|
@pytest.mark.parametrize("ENCODER_INPUT_FEATURES_DTYPE", ["TYPE_BF16"])
|
|
def test_mllama(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
CROSS_KV_CACHE_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
IMAGE_TYPE,
|
|
ENCODER_INPUT_FEATURES_DTYPE,
|
|
tensorrt_llm_multimodal_example_root,
|
|
tensorrt_llm_mllama_example_root,
|
|
mllama_model_root,
|
|
llm_backend_root,
|
|
llm_backend_multimodal_example_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. mllama is not supported with V1.")
|
|
|
|
if BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
        pytest.skip(
            "Skipping. Models with crossAttention are not supported with max_utilization."
        )
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_mllama_engine(
|
|
tensorrt_llm_multimodal_example_root, tensorrt_llm_mllama_example_root,
|
|
mllama_model_root, llm_backend_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Prepare multimodal specific repo
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"ensemble")
|
|
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
|
|
"multimodal_encoders")
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = mllama_model_root
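    # mllama uses cross attention, so a fraction of the KV cache pool is
    # reserved for cross-attention blocks; fall back to 0.5 when unset.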
|
|
if CROSS_KV_CACHE_FRACTION == "":
|
|
CROSS_KV_CACHE_FRACTION = "0.5"
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
|
|
CROSS_KV_CACHE_FRACTION=CROSS_KV_CACHE_FRACTION,
|
|
ENCODER_INPUT_FEATURES_DTYPE=ENCODER_INPUT_FEATURES_DTYPE,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
    # NOTE
    # Due to an MPI init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
|
|
check_call(
|
|
f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo} --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders",
|
|
shell=True)
|
|
check_server_ready()
|
|
|
|
# Run Test
|
|
if IMAGE_TYPE == 'URL':
|
|
IMAGE_URL = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
|
|
elif IMAGE_TYPE == 'BASE64':
|
|
IMAGE_URL = (
|
|
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/"
|
|
"2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/"
|
|
"2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/"
|
|
"wAARCAAKAAoDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/"
|
|
"8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoK"
|
|
"So0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztL"
|
|
"W2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQF"
|
|
"BgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8R"
|
|
"cYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqK"
|
|
"mqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwCK3trSL7TD5CqBmVP"
|
|
"tcUR2jYm2PlmHIUNjhgEJwdxFX1msWUNLG5kIyxF1FGCe/wAm75fp26VqzQxS6rEskSOrxKWDKCGJksgSfXIJH4mvPNTuJ4tWvI"
|
|
"45pERZ3VVViAAGOABW2GviN9NL6f1ff+uprVwLxEb05cr06KXTz2P/2Q==")
|
|
|
|
text_prompt = "<|image|>\nPlease elaborate what you see in the image?"
|
|
run_cmd = [
|
|
f"{llm_backend_multimodal_example_root}/client.py",
|
|
"--model_type=mllama",
|
|
f"--hf_model_dir={mllama_model_root}",
|
|
f"--text='{text_prompt}'",
|
|
f"--image={IMAGE_URL}",
|
|
]
|
|
if DECOUPLED_MODE == "True":
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
if E2E_MODEL_NAME == "tensorrt_llm_bls":
|
|
run_cmd += [
|
|
"--use_bls",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
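    # Also exercise the HTTP generate_stream endpoint directly with a curl
    # request against the same model.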
|
|
|
|
payload_str = json.dumps({
|
|
"id": "42",
|
|
"text_input": f"{text_prompt}",
|
|
"image_url_input": f"{IMAGE_URL}",
|
|
"parameters": {
|
|
"max_tokens": 16,
|
|
"top_k": 1,
|
|
"top_p": 0,
|
|
"stream": (DECOUPLED_MODE == "True"),
|
|
"temperature": 0
|
|
}
|
|
})
|
|
curl_cmd = f"curl -m 10 -X POST localhost:8000/v2/models/{E2E_MODEL_NAME}/generate_stream -d '{payload_str}'"
|
|
check_call(curl_cmd, shell=True)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("VIRTUAL_TOKENS", ["True", "False"],
|
|
ids=["withVirtualTokens", "withoutVirtualTokens"])
|
|
@pytest.mark.parametrize("ENABLE_CONTEXT_FMHA_FP32_ACC", ["True", "False"])
|
|
def test_gpt_next_ptuning_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
VIRTUAL_TOKENS,
|
|
ENABLE_CONTEXT_FMHA_FP32_ACC,
|
|
inflight_batcher_llm_client_root,
|
|
gpt_tokenizer_model_root,
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_next_ptuning_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH, output_model_dir = prepare_gpt_next_ptuning_engine(
|
|
"ifb", tensorrt_llm_gpt_example_root, gpt_next_ptuning_model_root)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
ENABLE_CONTEXT_FMHA_FP32_ACC=ENABLE_CONTEXT_FMHA_FP32_ACC,
|
|
)
|
|
# WAR for https://nvbugspro.nvidia.com/bug/4742149
|
|
gpu_name = query_gpu_name()
|
|
if "NVIDIA H20" == gpu_name:
|
|
check_call("pip3 install -U nvidia-cublas-cu12", shell=True)
|
|
|
|
# Generate reference output
|
|
run_py_path = os.path.join(tensorrt_llm_example_root, "run.py")
|
|
vocab_file = os.path.join(output_model_dir, "tokenizer.model")
|
|
# 1. Input with virtual tokens:
|
|
if VIRTUAL_TOKENS == "True":
|
|
prompt_table = os.path.join(tensorrt_llm_gpt_example_root,
|
|
"email_composition.npy")
|
|
input_tokens = os.path.join(tensorrt_llm_gpt_example_root, "input.csv")
|
|
run_cmd = [
|
|
f"{run_py_path}",
|
|
"--max_output_len=8",
|
|
f"--vocab_file={vocab_file}",
|
|
f"--prompt_table_path={prompt_table}",
|
|
f"--input_file={input_tokens}",
|
|
f"--engine_dir={ENGINE_PATH}",
|
|
f"--output_csv=output_w_prompt.csv",
|
|
"--no_add_special_tokens",
|
|
"--no-kv_cache_enable_block_reuse",
|
|
]
|
|
if ENABLE_CONTEXT_FMHA_FP32_ACC == "True":
|
|
run_cmd += [
|
|
"--enable_context_fmha_fp32_acc",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
# 2. Input w/o virtual tokens:
|
|
elif VIRTUAL_TOKENS == "False":
|
|
input_wo_prompt_csv = os.path.join(
|
|
llm_backend_venv.get_working_directory(), "input_wo_prompt.csv")
|
|
check_call(
|
|
f"echo \"25229,291,7379,251522,39854,5754,251514,315,32906,14297,398,261\" > {input_wo_prompt_csv}",
|
|
shell=True)
|
|
run_cmd = [
|
|
f"{run_py_path}",
|
|
"--max_output_len=8",
|
|
f"--vocab_file={vocab_file}",
|
|
f"--input_file={input_wo_prompt_csv}",
|
|
f"--engine_dir={ENGINE_PATH}",
|
|
f"--output_csv=output_wo_prompt.csv",
|
|
"--no_add_special_tokens",
|
|
]
|
|
if ENABLE_CONTEXT_FMHA_FP32_ACC == "True":
|
|
run_cmd += [
|
|
"--enable_context_fmha_fp32_acc",
|
|
]
|
|
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
|
|
# Run Test
|
|
if VIRTUAL_TOKENS == "True":
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--prompt-embedding-table={prompt_table}", "--prompt-task-id=0",
|
|
f"--input-tokens-csv={input_tokens}",
|
|
"--output-tokens-csv=output_w_prompt.csv", "--request-output-len=8",
|
|
"--check-output"
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
elif VIRTUAL_TOKENS == "False":
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--input-tokens-csv={input_wo_prompt_csv}",
|
|
"--output-tokens-csv=output_wo_prompt.csv",
|
|
"--request-output-len=8", "--check-output"
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("GPU_WEIGHTS_PERCENT", ["0.5", "1.0"])
|
|
def test_gpt_2b_lora_ifb(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
GPU_WEIGHTS_PERCENT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_2b_lora_model_root,
|
|
models_root,
|
|
llm_backend_venv,
|
|
):
|
|
if BATCHING_STRATEGY == "V1":
|
|
pytest.skip("Skipping. LoRA is not supported in V1.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
weight_streaming = float(GPU_WEIGHTS_PERCENT) < 1.0
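    # A GPU weights percentage below 1.0 keeps only part of the weights
    # resident on the GPU, so the engine is built with weight streaming enabled.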
|
|
ENGINE_PATH = prepare_gpt_2b_lora_engine("ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_2b_lora_model_root,
|
|
models_root, weight_streaming)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = os.path.join(models_root, "gpt-next",
|
|
"gpt-next-tokenizer-hf-v2")
|
|
modify_ib_config_pbtxt(new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
GPU_WEIGHTS_PERCENT=GPU_WEIGHTS_PERCENT)
|
|
|
|
# Generate reference output
|
|
run_py_path = os.path.join(tensorrt_llm_example_root, "run.py")
|
|
# Input with virtual tokens:
|
|
input_tokens = os.path.join(tensorrt_llm_gpt_example_root, "input.csv")
|
|
output_tokens = os.path.join(tensorrt_llm_gpt_example_root, "output.csv")
|
|
lora_path = os.path.join(tensorrt_llm_gpt_example_root,
|
|
"gpt-2b-lora-train-900")
|
|
lora_nemo_path = os.path.join(tensorrt_llm_gpt_example_root,
|
|
"gpt2b_lora-900.nemo")
|
|
run_cmd = [
|
|
f"{run_py_path}", "--max_output_len=8", f"--lora_dir={lora_nemo_path}",
|
|
"--lora_ckpt_source=nemo", "--lora_task_uids=0",
|
|
f"--input_file={input_tokens}", f"--output_csv={output_tokens}",
|
|
f"--engine_dir={ENGINE_PATH}", "--use_py_session",
|
|
f"--gpu_weights_percent={GPU_WEIGHTS_PERCENT}"
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
|
|
# Run Test
|
|
gen_cache_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--input-tokens-csv={input_tokens}",
|
|
f"--output-tokens-csv={output_tokens}",
|
|
"--request-output-len=8",
|
|
"--check-output",
|
|
f"--lora-path={lora_path}",
|
|
"--lora-task-id=12345",
|
|
]
|
|
venv_check_call(llm_backend_venv, gen_cache_cmd)
|
|
|
|
# Test GPU cache
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--input-tokens-csv={input_tokens}",
|
|
f"--output-tokens-csv={output_tokens}",
|
|
"--request-output-len=8",
|
|
"--check-output",
|
|
"--lora-task-id=12345",
|
|
]
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
|
|
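# Guided decoding test: each request pairs a guide type (json, json_schema,
# regex, or ebnf_grammar) with an optional guide string, and the generated text
# is checked for an expected keyword so the xgrammar-constrained outputs can be
# distinguished from the unconstrained baseline request.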
@pytest.mark.parametrize("TEST_TYPE", ["accuracy"])
|
|
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["True"])
|
|
@pytest.mark.parametrize("BACKEND", ["tensorrtllm", "python"])
|
|
@pytest.mark.parametrize("GUIDED_DECODING_BACKEND", ["xgrammar"])
|
|
def test_tiny_llama_1b_guided_decoding(
|
|
TEST_TYPE,
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
BACKEND,
|
|
GUIDED_DECODING_BACKEND,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_example_root,
|
|
tensorrt_llm_llama_example_root,
|
|
tiny_llama_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
|
|
# Build engine
|
|
ENGINE_PATH, XGRAMMAR_TOKENIZER_INFO_PATH = prepare_tiny_llama_1b_engine(
|
|
type=BACKEND,
|
|
tensorrt_llm_llama_example_root=tensorrt_llm_llama_example_root,
|
|
tiny_llama_model_root=tiny_llama_model_root,
|
|
tensorrt_llm_example_root=tensorrt_llm_example_root)
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = tiny_llama_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
BACKEND=BACKEND,
|
|
GUIDED_DECODING_BACKEND=GUIDED_DECODING_BACKEND,
|
|
XGRAMMAR_TOKENIZER_INFO_PATH=XGRAMMAR_TOKENIZER_INFO_PATH)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
if TEST_TYPE == "accuracy":
|
|
prompt = "What is the year after 2024? Answer:"
|
|
guide_type_lists = [
|
|
None, "json", "json_schema", "regex", "ebnf_grammar"
|
|
]
|
|
guide_lists = [
|
|
None, None,
|
|
'{"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"}',
|
|
r'\d+', 'root ::= [0-9]+'
|
|
]
|
|
keywords = ['Answer: 2026', '[2025]', '{"answer":2028}', '2025', '2025']
|
|
for guide_type, guide, keyword in zip(guide_type_lists, guide_lists,
|
|
keywords):
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/end_to_end_grpc_client.py",
|
|
f"--prompt={prompt}",
|
|
"--output-len=30",
|
|
"--exclude-input-in-output",
|
|
"--verbose",
|
|
]
|
|
|
|
if guide_type is not None:
|
|
run_cmd += [f"--guided-decoding-guide-type={guide_type}"]
|
|
|
|
if guide is not None:
|
|
run_cmd += [f"--guided-decoding-guide={guide}"]
|
|
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
validate_by_keyword(output, keyword)
|
|
|
|
|
|
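# Disaggregated serving test: TRTLLM_USE_MPI_KVCACHE is set before the server
# starts (assumption: this selects the MPI-based KV-cache transfer path used by
# disaggregated serving), the IFB model repo is extended with the
# disaggregated-serving models, and the standard cpp backend test suites are
# then run against the deployment.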
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["True"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
|
["max_utilization", "guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.2"])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["True"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", ["top_k_top_p"])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("FEATURE_NAME", ["test_basic"])
|
|
def test_gpt_disaggregated_serving_bls(
|
|
E2E_MODEL_NAME,
|
|
FEATURE_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
llm_backend_venv,
|
|
monkeypatch,
|
|
):
|
|
# Enable disaggregated serving.
|
|
monkeypatch.setenv("TRTLLM_USE_MPI_KVCACHE", "1")
|
|
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if BATCHING_STRATEGY == "V1" and FEATURE_NAME == "test_embedding_bias":
|
|
pytest.skip("Skipping. V1 doesn't support embedding_bias tensor yet.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH = prepare_gpt_350m_engine(
|
|
"ifb",
|
|
tensorrt_llm_gpt_example_root,
|
|
gpt_tokenizer_model_root,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
prepare_disaggregated_serving_model_repo(llm_backend_repo_root,
|
|
new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = gpt_tokenizer_model_root
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME="",
|
|
)
|
|
modify_disaggregated_serving_config_pbtxt(llm_backend_repo_root,
|
|
new_model_repo)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
feature_name = f"{FEATURE_NAME}"
|
|
tokenizer_dir = f"{gpt_tokenizer_model_root}"
|
|
|
|
if DECOUPLED_MODE == "False":
|
|
run_cpp_backend_tests(feature_name, llm_backend_venv,
|
|
inflight_batcher_llm_client_root, tokenizer_dir)
|
|
else:
|
|
test_model_name = ""
|
|
if ACCUMULATE_TOKEN == "True" and E2E_MODEL_NAME == "tensorrt_llm_bls":
|
|
test_model_name = "gpt_350m"
|
|
|
|
run_cpp_streaming_backend_tests(feature_name,
|
|
llm_backend_venv,
|
|
inflight_batcher_llm_client_root,
|
|
tokenizer_dir,
|
|
model_name=test_model_name,
|
|
e2e_model=E2E_MODEL_NAME)
|
|
|
|
|
|
# Define model configurations as a dictionary
MODEL_CONFIGS = {
    "llama_v2_7b": {
        "example_root_fixture": "tensorrt_llm_llama_example_root",
        "tokenizer_path_fixture": "llama_v2_tokenizer_model_root",
        "prepare_engine_fn": prepare_llama_v2_7b_engine
    },
    "gptj_6b": {
        "example_root_fixture": "tensorrt_llm_gptj_example_root",
        "tokenizer_path_fixture": "gptj_tokenizer_model_root",
        "prepare_engine_fn": prepare_gptj_6b_engine
    },
}

# Latency thresholds for different GPUs and models
LATENCY_THRESHOLDS = {
    "NVIDIA H100 PCIe": {
        "gptj_6b": 1300,  # Threshold in milliseconds
        "llama_v2_7b": 1200,
        # Can add more models here with their thresholds
    },
    # Can add more GPU types here
}


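# The thresholds are keyed by the full GPU name as reported by
# get_gpu_full_name() and then by model name; test_benchmark_core_model only
# asserts on latency when the current GPU has an entry in LATENCY_THRESHOLDS.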
# Fixture to handle model configuration
@pytest.fixture
def model_setup(request):
    model_name = request.param
    config = MODEL_CONFIGS[model_name]

    # Get the actual fixture values
    example_root = request.getfixturevalue(config["example_root_fixture"])
    tokenizer_path = request.getfixturevalue(config["tokenizer_path_fixture"])

    return {
        "name": model_name,
        "example_root": example_root,
        "tokenizer_path": tokenizer_path,
        "prepare_engine_fn": config["prepare_engine_fn"]
    }


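# model_setup is parametrized indirectly: each key of MODEL_CONFIGS becomes a
# test case, request.param selects the entry, and getfixturevalue resolves the
# named example-root and tokenizer fixtures only for the model under test.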
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", ["4096"])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["max_utilization"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["False"],
|
|
ids=["disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("model_setup",
|
|
list(MODEL_CONFIGS.keys()),
|
|
indirect=True)
|
|
def test_benchmark_core_model(
|
|
model_setup,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
llm_backend_inflight_batcher_llm_root,
|
|
llm_backend_dataset_root,
|
|
llm_backend_venv,
|
|
):
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build Engine
|
|
ENGINE_PATH = model_setup["prepare_engine_fn"](
|
|
"ifb", model_setup["example_root"], model_setup["tokenizer_path"])
|
|
TOKENIZER_PATH = model_setup["tokenizer_path"]
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
modify_ib_config_pbtxt(
|
|
new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --force --world_size 1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
# Run Test
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/benchmark_core_model.py",
|
|
"--concurrency=8",
|
|
"--max-input-len=300",
|
|
"dataset",
|
|
f"--dataset={llm_backend_dataset_root}/mini_cnn_eval.json",
|
|
f"--tokenizer-dir={TOKENIZER_PATH}",
|
|
]
|
|
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
print(output)
|
|
latency = retrieve_latency_value(output)
|
|
print(f"Extracted latency: {latency} ms")
|
|
|
|
gpu_full_name = get_gpu_full_name()
|
|
if gpu_full_name in LATENCY_THRESHOLDS:
|
|
latency_threshold = LATENCY_THRESHOLDS["NVIDIA H100 PCIe"][
|
|
model_setup["name"]]
|
|
assert latency < latency_threshold, f"Latency {latency} ms is greater than the threshold {latency_threshold} ms"
|
|
|
|
|
|
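# The llmapi backend below is configured through the model.yaml packaged with
# the tensorrt_llm model: the test rewrites decoupled mode, max_batch_size,
# tensor_parallel_size and the KV-cache fraction in place before launching the
# server, and uses --trtllm_llmapi_launch with MPI oversubscription only for
# the multi-GPU (TP=4) case.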
@pytest.mark.parametrize("E2E_MODEL_NAME", ["tensorrt_llm"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", [False, True],
|
|
ids=["disableDecoupleMode", "enableDecoupleMode"])
|
|
# TODO: [JIRA-4496] Add batch support in llmapi backend and add tests here.
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
|
|
@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
|
|
def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
|
|
TENSOR_PARALLEL_SIZE,
|
|
llm_backend_inflight_batcher_llm_root, llm_backend_venv,
|
|
llm_backend_dataset_root, tiny_llama_model_root):
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
|
|
if torch.cuda.device_count() < int(TENSOR_PARALLEL_SIZE):
|
|
pytest.skip("Skipping. Not enough GPUs.")
|
|
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_llmapi_model_repo(llm_backend_repo_root, new_model_repo)
|
|
model_config_path = os.path.join(new_model_repo, "tensorrt_llm", "1",
|
|
"model.yaml")
|
|
with open(model_config_path, "r") as f:
|
|
model_config = yaml.safe_load(f)
|
|
model_config["triton_config"]["decoupled"] = DECOUPLED_MODE
|
|
model_config["triton_config"]["max_batch_size"] = int(TRITON_MAX_BATCH_SIZE)
|
|
model_config["tensor_parallel_size"] = int(TENSOR_PARALLEL_SIZE)
|
|
model_config["kv_cache_config"] = {"free_gpu_memory_fraction": 0.8}
|
|
model_config["model"] = tiny_llama_model_root
|
|
with open(model_config_path, "w") as f:
|
|
yaml.dump(model_config, f)
|
|
|
|
with open(model_config_path, "r") as f:
|
|
model_config = yaml.safe_load(f)
|
|
print_info(f"DEBUG:: model_config: {model_config}")
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
cmd = f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo}"
|
|
if TENSOR_PARALLEL_SIZE == "4":
|
|
cmd += " --trtllm_llmapi_launch"
|
|
cmd += " --oversubscribe"
|
|
else:
|
|
cmd += " --no-mpi"
|
|
print_info(f"DEBUG:: launch_server with args: {cmd}")
|
|
check_call(cmd, shell=True)
|
|
check_server_ready()
|
|
|
|
# Speed up the test by running multiple tests with different configurations sharing the same triton server.
|
|
protocols = ["http", "grpc"]
|
|
STREAMS = [False, True]
|
|
if DECOUPLED_MODE:
|
|
protocols = ['grpc'] # Triton only support grpc in decoupled mode
|
|
STREAMS = [True] # Triton only support non-streaming in decoupled mode
|
|
else:
|
|
STREAMS = [False
|
|
] # Triton only support non-streaming in non-decoupled mode
|
|
|
|
for protocol in protocols:
|
|
for STREAM in STREAMS:
|
|
print_info(
|
|
f"DEBUG:: protocol: {protocol}, STREAM: {STREAM}, DECOUPLED_MODE: {DECOUPLED_MODE}"
|
|
)
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/end_to_end_test.py",
|
|
f"--protocol={protocol}",
|
|
f"--test-llmapi",
|
|
f"--model-name={E2E_MODEL_NAME}",
|
|
f"--max-input-len=192",
|
|
f"--dataset={os.path.join(llm_backend_dataset_root, 'mini_cnn_eval.json')}",
|
|
]
|
|
if STREAM:
|
|
run_cmd += [
|
|
"--streaming",
|
|
]
|
|
|
|
print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
run_cmd = [
|
|
f"{llm_backend_inflight_batcher_llm_root}/benchmark_core_model.py",
|
|
f"--max-input-len=300",
|
|
f"--tensorrt-llm-model-name={E2E_MODEL_NAME}",
|
|
f"--protocol={protocol}",
|
|
f"--test-llmapi",
|
|
'dataset',
|
|
f"--dataset={os.path.join(llm_backend_dataset_root, 'mini_cnn_eval.json')}",
|
|
f"--tokenizer-dir={tiny_llama_model_root}",
|
|
]
|
|
|
|
print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
|
|
venv_check_call(llm_backend_venv, run_cmd)
|
|
|
|
# Test request cancellation with stop request
|
|
run_cmd = [
|
|
f"{llm_backend_repo_root}/tools/llmapi_client.py",
|
|
"--request-output-len=200", '--stop-after-ms=25'
|
|
]
|
|
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
assert 'Request is cancelled' in output
|
|
|
|
# Test request cancellation with request cancel
|
|
run_cmd += ['--stop-via-request-cancel']
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
assert 'Request is cancelled' in output
|
|
|
|
# Test request cancellation for non-existing request and completed request
|
|
run_cmd = [
|
|
f"{llm_backend_repo_root}/tools/tests/test_llmapi_cancel.py"
|
|
]
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
|
|
|
|
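# Token-count test: the client passes --return-num-input-tokens and/or
# --return-num-output-tokens; in non-decoupled mode the full output token count
# is returned in one response, while in streaming (decoupled) mode each
# response chunk reports its own count, which is what the asserts below check.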
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
|
|
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
|
|
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
|
|
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
|
|
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
|
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
|
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
|
ids=["disableTrtOverlap"])
|
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
|
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
|
|
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
|
|
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
|
|
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
|
|
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
|
|
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
|
|
@pytest.mark.parametrize("DECODING_MODE", [""])
|
|
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
|
|
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
|
|
@pytest.mark.parametrize("TOKEN_COUNT_TEST",
|
|
["input_only", "output_only", "both"])
|
|
@pytest.mark.parametrize("BACKEND", ["tensorrtllm", "python"])
|
|
def test_tiny_llama_ifb_token_counts(
|
|
E2E_MODEL_NAME,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
ENABLE_TRT_OVERLAP,
|
|
BATCHING_STRATEGY,
|
|
DECOUPLED_MODE,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
TOKEN_COUNT_TEST,
|
|
BACKEND,
|
|
inflight_batcher_llm_client_root,
|
|
tensorrt_llm_llama_example_root,
|
|
tiny_llama_model_root,
|
|
llm_backend_venv,
|
|
):
|
|
"""Test that the TRT-LLM inflight batcher backend can return input and output token counts."""
|
|
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
|
|
pytest.skip("Skipping. V1 doesn't support max_utilization.")
|
|
|
|
if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
|
|
pytest.skip("Skipping.")
|
|
|
|
llm_backend_repo_root = os.path.join(LLM_ROOT, "triton_backend")
|
|
# Build engine
|
|
ENGINE_PATH, _ = prepare_tiny_llama_1b_engine(
|
|
type="ifb",
|
|
tensorrt_llm_llama_example_root=tensorrt_llm_llama_example_root,
|
|
tiny_llama_model_root=tiny_llama_model_root,
|
|
tensorrt_llm_example_root=None,
|
|
)
|
|
# Prepare model repo
|
|
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
|
|
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
|
|
|
|
# Modify config.pbtxt
|
|
TOKENIZER_PATH = tiny_llama_model_root
|
|
modify_ib_config_pbtxt(new_model_repo,
|
|
ENGINE_PATH,
|
|
TOKENIZER_PATH,
|
|
llm_backend_repo_root,
|
|
DECOUPLED_MODE,
|
|
MAX_TOKENS_IN_KV_CACHE,
|
|
MAX_ATTENTION_WINDOW_SIZE,
|
|
BATCH_SCHEDULER_POLICY,
|
|
BATCHING_STRATEGY,
|
|
KV_CACHE_FREE_GPU_MEM_FRACTION,
|
|
EXCLUDE_INPUT_IN_OUTPUT,
|
|
ENABLE_TRT_OVERLAP,
|
|
TRITON_MAX_BATCH_SIZE,
|
|
MAX_QUEUE_DELAY_MICROSECONDS,
|
|
MAX_BEAM_WIDTH,
|
|
ENABLE_KV_CACHE_REUSE,
|
|
NORMALIZE_LOG_PROBS,
|
|
ENABLE_CHUNKED_CONTEXT,
|
|
GPU_DEVICE_IDS,
|
|
DECODING_MODE,
|
|
PREPROCESSING_INSTANCE_COUNT,
|
|
POSTPROCESSING_INSTANCE_COUNT,
|
|
ACCUMULATE_TOKEN,
|
|
BLS_INSTANCE_COUNT,
|
|
TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
|
|
TENSORRT_LLM_DRAFT_MODEL_NAME="",
|
|
BACKEND=BACKEND)
|
|
|
|
# Launch Triton Server
|
|
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
|
|
"launch_triton_server.py")
|
|
check_call(
|
|
f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
|
|
shell=True)
|
|
check_server_ready()
|
|
|
|
# Test token count functionality based on the test type
|
|
tokenizer_dir = f"{tiny_llama_model_root}"
|
|
|
|
# Prepare different test commands based on token count test type
|
|
if TOKEN_COUNT_TEST == "input_only":
|
|
test_args = ["--return-num-input-tokens"]
|
|
elif TOKEN_COUNT_TEST == "output_only":
|
|
test_args = ["--return-num-output-tokens"]
|
|
elif TOKEN_COUNT_TEST == "both":
|
|
test_args = ["--return-num-input-tokens", "--return-num-output-tokens"]
|
|
|
|
if DECOUPLED_MODE == "False":
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={tokenizer_dir}",
|
|
"--tokenizer-type=auto",
|
|
"--request-output-len=20",
|
|
] + test_args
|
|
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
else:
|
|
run_cmd = [
|
|
f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
|
|
f"--tokenizer-dir={tokenizer_dir}",
|
|
"--tokenizer-type=auto",
|
|
"--request-output-len=20",
|
|
"--streaming",
|
|
] + test_args
|
|
|
|
output = venv_check_output(llm_backend_venv, run_cmd)
|
|
|
|
print(output)
|
|
if TOKEN_COUNT_TEST == "input_only":
|
|
assert "Input token count: [[13]]" in output
|
|
elif TOKEN_COUNT_TEST == "output_only":
|
|
if DECOUPLED_MODE == "False":
|
|
assert "Output token count: [[33]]" in output
|
|
else:
|
|
assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
|
|
elif TOKEN_COUNT_TEST == "both":
|
|
assert "Input token count: [[13]]" in output
|
|
if DECOUPLED_MODE == "False":
|
|
assert "Output token count: [[33]]" in output
|
|
else:
|
|
assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
|
|
print_info(
|
|
f"Successfully tested token count functionality for {TOKEN_COUNT_TEST} mode"
|
|
)
|
|
|
|
|
|
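# Multimodal (Pixtral) test: a text engine and a vision encoder engine are
# built, the repo gains the multimodal ensemble and encoder models, and each
# prompt/image combination is validated by matching the response against a
# case-insensitive regex.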
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
                         ["max_utilization", "guaranteed_no_evict"])
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
@pytest.mark.parametrize("CROSS_KV_CACHE_FRACTION", [""])
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
                         ids=["disableTrtOverlap"])
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
                         ids=["enableDecoupleMode", "disableDecoupleMode"])
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["1"])
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
@pytest.mark.parametrize("DECODING_MODE", [""])
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
@pytest.mark.parametrize("PROMPT_EMBEDDING_TABLE_DTYPE",
                         ["TYPE_BF16"])  # allow override later
@pytest.mark.parametrize("ENCODER_INPUT_FEATURES_DTYPE",
                         ["TYPE_FP16"])  # pixtral uses fp16 vision by default
def test_mistral_small_3_1_24b_pixtral(
    E2E_MODEL_NAME,
    MAX_TOKENS_IN_KV_CACHE,
    MAX_ATTENTION_WINDOW_SIZE,
    BATCH_SCHEDULER_POLICY,
    KV_CACHE_FREE_GPU_MEM_FRACTION,
    CROSS_KV_CACHE_FRACTION,
    ENABLE_TRT_OVERLAP,
    BATCHING_STRATEGY,
    DECOUPLED_MODE,
    TRITON_MAX_BATCH_SIZE,
    MAX_QUEUE_DELAY_MICROSECONDS,
    MAX_BEAM_WIDTH,
    ENABLE_KV_CACHE_REUSE,
    NORMALIZE_LOG_PROBS,
    ENABLE_CHUNKED_CONTEXT,
    GPU_DEVICE_IDS,
    DECODING_MODE,
    PREPROCESSING_INSTANCE_COUNT,
    POSTPROCESSING_INSTANCE_COUNT,
    ACCUMULATE_TOKEN,
    BLS_INSTANCE_COUNT,
    EXCLUDE_INPUT_IN_OUTPUT,
    PROMPT_EMBEDDING_TABLE_DTYPE,
    ENCODER_INPUT_FEATURES_DTYPE,
    tensorrt_llm_multimodal_example_root,
    tensorrt_llm_llama_example_root,
    mistral_small_3_1_24b_model_root,
    llm_backend_multimodal_example_root,
    llm_backend_venv,
    llm_root,
):
    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
        pytest.skip("Skipping. V1 doesn't support max_utilization.")

    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]

    # Build Engines (LLM + vision)
    ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_mistral3_pixtral_engine(
        tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
        mistral_small_3_1_24b_model_root)

    # Prepare model repo
    new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
    prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)

    # Prepare multimodal specific repo
    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
                                  "ensemble")
    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
                                  "multimodal_encoders")

    # Modify config.pbtxt
    TOKENIZER_PATH = mistral_small_3_1_24b_model_root
    modify_ib_config_pbtxt(
        new_model_repo,
        ENGINE_PATH,
        TOKENIZER_PATH,
        llm_backend_repo_root,
        DECOUPLED_MODE,
        MAX_TOKENS_IN_KV_CACHE,
        MAX_ATTENTION_WINDOW_SIZE,
        BATCH_SCHEDULER_POLICY,
        BATCHING_STRATEGY,
        KV_CACHE_FREE_GPU_MEM_FRACTION,
        EXCLUDE_INPUT_IN_OUTPUT,
        ENABLE_TRT_OVERLAP,
        TRITON_MAX_BATCH_SIZE,
        MAX_QUEUE_DELAY_MICROSECONDS,
        MAX_BEAM_WIDTH,
        ENABLE_KV_CACHE_REUSE,
        NORMALIZE_LOG_PROBS,
        ENABLE_CHUNKED_CONTEXT,
        GPU_DEVICE_IDS,
        DECODING_MODE,
        PREPROCESSING_INSTANCE_COUNT,
        POSTPROCESSING_INSTANCE_COUNT,
        ACCUMULATE_TOKEN,
        BLS_INSTANCE_COUNT,
        MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
        ENCODER_INPUT_FEATURES_DTYPE=ENCODER_INPUT_FEATURES_DTYPE,
        PROMPT_EMBEDDING_TABLE_DTYPE=PROMPT_EMBEDDING_TABLE_DTYPE,
    )

    # Launch Triton Server
    launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
                                    "launch_triton_server.py")
    check_call(
        f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
        shell=True)
    check_server_ready()

    image_merlion = os.path.join(
        llm_root,
        "tests/integration/test_input_files/merlion.png",
    )
    image_football = os.path.join(
        llm_root,
        "tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg",
    )
    image_hockey = os.path.join(
        llm_root,
        "tests/integration/test_input_files/pexels-ron-lach-8975010.jpg",
    )
    image_basketball = os.path.join(
        llm_root,
        "tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg",
    )

    test_cases = [
        {
            "text": "What is the capital of England?",
            "image": "",
            "match": re.compile("london", re.IGNORECASE)
        },
        {
            "text": "In as few words as possible, what city is this?",
            "image": image_merlion,
            "match": re.compile("singapore", re.IGNORECASE)
        },
        {
            "text":
            "In as few words as possible, what sports are depicted in the images?",
            "image":
            ",".join([image_football, image_hockey]),
            "match":
            re.compile("(football|soccer).*hockey", re.IGNORECASE | re.DOTALL)
        },
        {
            "text":
            "In as few words as possible, what sports are depicted in the images?",
            "image":
            ",".join([image_football, image_hockey, image_basketball]),
            "match":
            re.compile("(football|soccer).*hockey.*basket",
                       re.IGNORECASE | re.DOTALL)
        },
    ]

    for test_case in test_cases:
        TEXT = test_case["text"]
        IMAGE = test_case["image"]
        MATCH = test_case["match"]

        # Run Test: use multimodal client; set model_type to pixtral
        run_cmd = [
            f"{llm_backend_multimodal_example_root}/client.py",
            "--model_type=pixtral",
            f"--text={TEXT}",
            f"--image={IMAGE}",
            "--request-output-len=128",
            "--end-id=2",
        ]
        if DECOUPLED_MODE == "True":
            run_cmd += ["--streaming"]

        if E2E_MODEL_NAME == "tensorrt_llm_bls":
            run_cmd += ["--use_bls"]

        output = venv_check_output(llm_backend_venv, run_cmd)

        assert MATCH.search(
            output), f"Test failed for input: {TEXT=}, {IMAGE=}, {output=}"