#!/usr/bin/env python
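"""Run the llm_examples.py tasks end to end against a LLaMA model.

Each task is invoked as a subprocess: build and run an engine from a
Hugging Face checkpoint (optionally with tensor parallelism), reload the
dumped engine, and generate asynchronously (optionally with streaming).
This docstring is an added summary derived from the code below.
"""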
import os
import subprocess
import sys
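
# A minimal argument guard (an addition, not in the original script): fail
# fast with a usage message instead of an IndexError when the required
# model directory is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} LLAMA_MODEL_DIR [TMP_ENGINE_DIR] [EXAMPLES_ROOT]")
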
PROMPT = "Tell a story"
# Positional arguments: the model directory is required; the engine dump
# directory and the examples root fall back to defaults.
LLAMA_MODEL_DIR = sys.argv[1]
TMP_ENGINE_DIR = sys.argv[2] if len(sys.argv) > 2 else "./tllm.engine.example"
EXAMPLES_ROOT = sys.argv[3] if len(sys.argv) > 3 else ""
LLM_EXAMPLES = os.path.join(EXAMPLES_ROOT, 'llm_examples.py')

# Build an engine from the Hugging Face model, run it, and dump the engine
# for later reuse.
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}",
    f"--dump_engine_dir={TMP_ENGINE_DIR}"
]
subprocess.run(run_cmd, check=True)

# The same task with tensor parallelism enabled (tp_size=2 shards the model
# across two GPUs).
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_huggingface_model",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--tp_size=2"
]
subprocess.run(run_cmd, check=True)

# Reload and run from the engine dumped by the first task.
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_from_tllm_engine",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}",
    f"--dump_engine_dir={TMP_ENGINE_DIR}"
]
subprocess.run(run_cmd, check=True)

# Asynchronous generation.
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}"
]
subprocess.run(run_cmd, check=True)

# Both TP and streaming enabled.
run_cmd = [
    sys.executable, LLM_EXAMPLES, "--task=run_llm_generate_async_example",
    f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}", "--streaming",
    "--tp_size=2"
]
subprocess.run(run_cmd, check=True)
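
# A possible refactor (a sketch, not part of the original script): the five
# invocations above differ only in the task name and extra flags, so they
# could be driven by a small helper:
#
#     def run_task(task: str, *extra_args: str) -> None:
#         subprocess.run([
#             sys.executable, LLM_EXAMPLES, f"--task={task}",
#             f"--prompt={PROMPT}", f"--hf_model_dir={LLAMA_MODEL_DIR}",
#             *extra_args,
#         ], check=True)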