TensorRT-LLMs/examples/high-level-api/run_examples.py

import os
import subprocess
import sys
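
# Drives the high-level API examples in llm_examples.py end to end.
# Usage: python run_examples.py <llama_model_dir> [engine_dump_dir] [examples_root]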
PROMPT = "Tell a story"
LLAMA_MODEL_DIR = sys.argv[1]
TMP_ENGINE_DIR = sys.argv[2] if len(sys.argv) > 2 else "./tllm.engine.example"
EXAMPLES_ROOT = sys.argv[3] if len(sys.argv) > 3 else ""
LLM_EXAMPLES = os.path.join(EXAMPLES_ROOT, 'llm_examples.py')
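
# Run the LLM directly from the HuggingFace checkpoint and dump the built
# engine to TMP_ENGINE_DIR so a later step can reload it.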
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}",
    f"--dump_engine_dir={TMP_ENGINE_DIR}"
]
subprocess.run(run_cmd, check=True)

# Same task with tensor parallelism (tp_size=2) enabled
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--tp_size=2"
]
subprocess.run(run_cmd, check=True)
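
# Reload the engine dumped above instead of rebuilding from the HF checkpoint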
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_tllm_engine",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}",
    f"--dump_engine_dir={TMP_ENGINE_DIR}"
]
subprocess.run(run_cmd, check=True)
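
# Asynchronous generation from the HuggingFace model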
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}"
]
subprocess.run(run_cmd, check=True)

# Both tensor parallelism and streaming enabled
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--streaming",
    "--tp_size=2"
]
subprocess.run(run_cmd, check=True)