Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
move the rest of the models into the examples/models/core directory (#3555)
* move rest models to examples/models/core directory
* update multimodal readme
* fix example path
* fix ci
* fix cpp test
* fix tensorrt test
* fix ci (repeated across subsequent commits)

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
parent c35d2a7532
commit d51ae53940
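In practical terms, the model examples referenced throughout this diff move from `examples/<model>` to `examples/models/core/<model>`, and the shared scripts (`run.py`, `summarize.py`, `mmlu.py`, `quantization/quantize.py`) are now reached three directory levels up from each model folder. A minimal before/after sketch of the invocation paths, using the LLaMA example; the checkpoint and engine directory names are illustrative placeholders, not part of this commit:

```bash
# Old layout: model examples lived directly under examples/, shared scripts one level up.
cd examples/llama
python convert_checkpoint.py --model_dir ./llama-hf --output_dir ./trt_ckpt   # paths illustrative
python ../run.py --engine_dir ./trt_engine --tokenizer_dir ./llama-hf --max_output_len 20

# New layout: model examples live under examples/models/core/, shared scripts three levels up.
cd examples/models/core/llama
python convert_checkpoint.py --model_dir ./llama-hf --output_dir ./trt_ckpt
python ../../../run.py --engine_dir ./trt_engine --tokenizer_dir ./llama-hf --max_output_len 20
```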
@@ -31,7 +31,7 @@ def build_engine(base_model_dir: _pl.Path, eagle_model_dir: _pl.Path,
                  engine_dir: _pl.Path, build_base_model: bool, *args):

     if build_base_model:
-        checkpoint_path = "examples/llama/convert_checkpoint.py"
+        checkpoint_path = "examples/models/core/llama/convert_checkpoint.py"
     else:
         checkpoint_path = "examples/eagle/convert_checkpoint.py"

@@ -119,7 +119,7 @@ class Convert(RunCMDMixin):
     def command(self):
         args = self.args
         return [
-            f'python examples/enc_dec/convert_checkpoint.py',
+            f'python examples/models/core/enc_dec/convert_checkpoint.py',
             f'--model_type {args.model_type}',
             f'--model_dir {args.hf_models_dir}',
             f'--output_dir {args.trt_models_dir}',

@@ -37,7 +37,7 @@ def convert_ckpt(model_dir: str,
                  world_size: int = 1,
                  dtype: str = 'float16'):
     convert_cmd = [
-        sys.executable, "examples/gpt/convert_checkpoint.py",
+        sys.executable, "examples/models/core/gpt/convert_checkpoint.py",
         f"--model_dir={model_dir}", f"--output_dir={output_dir}",
         f"--dtype={dtype}", f"--tp_size={world_size}"
     ] + list(args)
@@ -32,11 +32,12 @@ def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, convert_extra_args,

     ckpt_dir = engine_dir / 'ckpt'

-    convert_cmd = [_sys.executable, "examples/llama/convert_checkpoint.py"
-                   ] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
-                       f'--output_dir={ckpt_dir}',
-                       '--dtype=float16',
-                   ] + convert_extra_args
+    convert_cmd = [
+        _sys.executable, "examples/models/core/llama/convert_checkpoint.py"
+    ] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
+        f'--output_dir={ckpt_dir}',
+        '--dtype=float16',
+    ] + convert_extra_args

     run_command(convert_cmd)

@@ -31,12 +31,13 @@ import tensorrt_llm.bindings as _tb

 def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
                  *args):
-    convert_args = [_sys.executable, "examples/mamba/convert_checkpoint.py"] + (
-        ['--model_dir', str(weight_dir)] if weight_dir else []) + [
-            '--output_dir',
-            str(ckpt_dir),
-            '--dtype=float16',
-        ]
+    convert_args = [
+        _sys.executable, "examples/models/core/mamba/convert_checkpoint.py"
+    ] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
+        '--output_dir',
+        str(ckpt_dir),
+        '--dtype=float16',
+    ]
     run_command(convert_args)
     build_args = ["trtllm-build"] + ['--checkpoint_dir',
                                      str(ckpt_dir)] + [

@@ -32,7 +32,8 @@ import tensorrt_llm.bindings as _tb
 def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
                  *args):
     convert_args = [
-        _sys.executable, "examples/recurrentgemma/convert_checkpoint.py"
+        _sys.executable,
+        "examples/models/core/recurrentgemma/convert_checkpoint.py"
     ] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
         '--output_dir',
         str(ckpt_dir),
@@ -11,7 +11,7 @@ class Run(RunCMDMixin):
         for beam in args.beams_tuple:
             ret.append((
                 mpi_run,
-                f'python3 examples/enc_dec/run.py --engine_dir {args.engines_dir}',
+                f'python3 examples/models/core/enc_dec/run.py --engine_dir {args.engines_dir}',
                 f'--engine_name {args.ckpt}',
                 f'--model_name "{args.hf_models_dir}"',
                 f'--max_new_tokens={args.max_new_tokens}',

@@ -1,22 +0,0 @@
-{
-    "builder_config": {
-        "max_batch_size": 256,
-        "max_input_len": 512,
-        "name": "bert",
-        "precision": "float16",
-        "tensor_parallel": 1,
-        "use_refit": false
-    },
-    "plugin_config": {
-        "bert_attention_plugin": "float16",
-        "context_fmha_enabled": true,
-        "gemm_plugin": "float16",
-        "gpt_attention_plugin": false,
-        "identity_plugin": false,
-        "layernorm_plugin": false,
-        "layernorm_quantization_plugin": false,
-        "nccl_plugin": false,
-        "smooth_quant_gemm_plugin": false,
-        "weight_only_quant_matmul_plugin": false
-    }
-}
@@ -1,22 +0,0 @@
-{
-    "builder_config": {
-        "max_batch_size": 256,
-        "max_input_len": 512,
-        "name": "bert",
-        "precision": "float16",
-        "tensor_parallel": 1,
-        "use_refit": false
-    },
-    "plugin_config": {
-        "bert_attention_plugin": "float16",
-        "context_fmha_enabled": true,
-        "gemm_plugin": "float16",
-        "gpt_attention_plugin": false,
-        "identity_plugin": false,
-        "layernorm_plugin": false,
-        "layernorm_quantization_plugin": false,
-        "nccl_plugin": false,
-        "smooth_quant_gemm_plugin": false,
-        "weight_only_quant_matmul_plugin": false
-    }
-}

@@ -1,22 +0,0 @@
-{
-    "builder_config": {
-        "max_batch_size": 256,
-        "max_input_len": 512,
-        "name": "bert",
-        "precision": "float16",
-        "tensor_parallel": 1,
-        "use_refit": false
-    },
-    "plugin_config": {
-        "bert_attention_plugin": false,
-        "context_fmha_enabled": false,
-        "gemm_plugin": false,
-        "gpt_attention_plugin": false,
-        "identity_plugin": false,
-        "layernorm_plugin": false,
-        "layernorm_quantization_plugin": false,
-        "nccl_plugin": false,
-        "smooth_quant_gemm_plugin": false,
-        "weight_only_quant_matmul_plugin": false
-    }
-}

@@ -1,22 +0,0 @@
-{
-    "builder_config": {
-        "max_batch_size": 256,
-        "max_input_len": 512,
-        "name": "bert",
-        "precision": "float16",
-        "tensor_parallel": 1,
-        "use_refit": false
-    },
-    "plugin_config": {
-        "bert_attention_plugin": "float16",
-        "context_fmha_enabled": true,
-        "gemm_plugin": "float16",
-        "gpt_attention_plugin": false,
-        "identity_plugin": false,
-        "layernorm_plugin": false,
-        "layernorm_quantization_plugin": false,
-        "nccl_plugin": false,
-        "smooth_quant_gemm_plugin": false,
-        "weight_only_quant_matmul_plugin": false
-    }
-}
@@ -25,7 +25,7 @@ We provide two styles of running DTM now: using TensorRT-LLM-BLS in Triton Infer
 + `--max_batch_size` more than 1 is acceptable in general usage, but we use 1 in this example.

 ```bash
-cd examples/llama
+cd examples/models/core/llama
 export DRAFT_CKPT_PATH=/workspace/ckpt-draft
 export TARGET_CKPT_PATH=/workspace/ckpt-target
 export DRAFT_ENGINE_PATH=/workspace/engine-draft

@@ -4,7 +4,7 @@ This document explains how to build the BERT family, specifically [BERT](https:/

 ## Overview

-The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../tensorrt_llm/models/bert/model.py).
+The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../../../tensorrt_llm/models/bert/model.py).
 The TensorRT-LLM BERT family example code is located in [`examples/bert`](./). There are two main files in that folder:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the BERT model into tensorrt-llm checkpoint format.

@@ -18,15 +18,15 @@ This document explains how to build the [C4AI Command-R](https://huggingface.co/

 ## Overview

-The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../tensorrt_llm/models/commandr/model.py).
+The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../../../tensorrt_llm/models/commandr/model.py).
 The TensorRT-LLM Command-R example code is located in [`examples/commandr`](./). There is one main file:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.

-In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:

-* [`../run.py`](../run.py) to run the inference on an input text;
-* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+* [`run.py`](../../../run.py) to run the inference on an input text;
+* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.

 ## Support Matrix
@@ -122,23 +122,23 @@ If the engines are built successfully, you will see output like (Command-R as th

 ```bash
 # Run the default engine of Command-R on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu

 # Run the default engine of Command-R on single GPU, using streaming output.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu \
     --streaming

 # Run the default engine of Aya-23-8B on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir aya_23_8B \
     --engine_dir trt_engines/aya_23_8B/fp16/1-gpu

 # Run the default engine of Aya-23-35B on single GPU.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir aya_23_35B \
     --engine_dir trt_engines/aya_23_35B/fp16/1-gpu
 ```

@@ -148,7 +148,7 @@ python3 ../run.py --max_output_len 50 \
 ```bash
 # Run the Tensor Parallel 4 engine of Command-R+ on 4 GPUs.
 mpirun -n 4 \
-    python ../run.py --max_output_len 50 \
+    python ../../../run.py --max_output_len 50 \
         --tokenizer_dir command_r_plus \
         --engine_dir trt_engines/command_r_plus/fp16/4-gpu
 ```

@@ -165,7 +165,7 @@ Output [Text 0 Beam 0]: " chef in Paris and worked in the kitchens of the French

 ```bash
 # Run the summarization of Command-R task.
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --hf_model_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/fp16/1-gpu
 ```

@@ -201,7 +201,7 @@ trtllm-build --checkpoint_dir trt_ckpt/command_r_v01/int8_wo/1-gpu \
     --output_dir trt_engines/command_r_v01/int8_wo/1-gpu

 # Run inference.
-python3 ../run.py --max_output_len 50 \
+python3 ../../../run.py --max_output_len 50 \
     --tokenizer_dir command_r_v01 \
     --engine_dir trt_engines/command_r_v01/int8_wo/1-gpu
 ```

@@ -1,4 +1,4 @@
--c ../constraints.txt
+-c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate
@@ -27,7 +27,7 @@ This document shows how to build and run an Encoder-Decoder (Enc-Dec) model in T

 ## Overview

-The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):
+The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):

 * `trtllm-build` to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Enc-Dec model,
 * [`run.py`](./run.py) to run the inference on an example input text.

@@ -202,7 +202,7 @@ Different types of runtime are provided for encoder-decoder models. Following an
 - Python runtime w/ Static Batching
 - (NEW) C++ runtime w/ Paged KV Cache and Inflight Batching

-Please refer to the documentation for the details of [paged kv cache](../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../docs/source/advanced/gpt-attention.md#inflight-batching).
+Please refer to the documentation for the details of [paged kv cache](../../../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../../../docs/source/advanced/gpt-attention.md#inflight-batching).

 #### Run C++ runtime
 **Note: to use inflight batching and paged kv cache features in C++ runtime, please make sure you have set `--paged_kv_cache enable` (which is by default enabled) in the `trtllm-build` command of the decoder. Meanwhile, if using Python runtime, it is recommended to disable this flag by `--paged_kv_cache disable` to avoid any unnecessary overhead.**

@@ -213,12 +213,12 @@ For good usability, Python binding of the C++ runtime is provided. You can use t

 ```python
 # Inferencing via python binding of C++ runtime with inflight batching (IFB)
-python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
+python3 ../../../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
 ```

 You can specify `--kv_cache_free_gpu_memory_fraction` to control the percentage of free GPU memory to be used by KV cache (by default 0.9), and `--cross_kv_cache_fraction` to control the percentage of KV cache to be used by cross attention (by default 0.5, and rest of the KV cache will be used by self attention).

-For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).
+For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).

 #### Run with Triton Backend
 [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/encoder_decoder.md) contains the tutorial on how to run encoder-decoder engines with Tritonserver.
@@ -2,7 +2,7 @@

 This document shows how to build and run a [EXAONE](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) model in TensorRT-LLM.

-The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../tensorrt_llm/models/llama/model.py).
+The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
 See the LLaMA example [`examples/llama`](../llama) for details.

 - [EXAONE](#exaone)

@@ -113,7 +113,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply FP8 quantization.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat fp8 \

@@ -134,7 +134,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply INT8 SmoothQuant.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat int8_sq \

@@ -154,7 +154,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README

 ```bash
 # Build the EXAONE model using a single GPU and and apply INT4 AWQ.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat int4_awq \

@@ -173,7 +173,7 @@ Please make sure your system contains a Hopper GPU before trying the commands be

 ```bash
 # Build the EXAONE model using a single GPU and and apply W4A8 AWQ.
-python ../quantization/quantize.py \
+python ../../../quantization/quantize.py \
     --model_dir $HF_MODEL_DIR \
     --dtype float16 \
     --qformat w4a8_awq \

@@ -190,7 +190,7 @@ trtllm-build \
 Test your engine with the [run.py](../run.py) script:

 ```bash
-python3 ../run.py \
+python3 ../../../run.py \
     --input_text "When did the first world war end?" \
     --max_output_len=100 \
     --tokenizer_dir $HF_MODEL_DIR \
@@ -198,13 +198,13 @@ python3 ../run.py \

 # Run with 2 GPUs
 mpirun -n 2 --allow-run-as-root \
-    python3 ../run.py \
+    python3 ../../../run.py \
         --input_text "When did the first world war end?" \
         --max_output_len=100 \
         --tokenizer_dir $HF_MODEL_DIR \
         --engine_dir trt_engines/exaone/fp16/2-gpu

-python ../summarize.py \
+python ../../../summarize.py \
     --test_trt_llm \
     --data_type fp16 \
     --hf_model_dir $HF_MODEL_DIR \

@@ -91,7 +91,7 @@ Note that we need to download the dataset of MMLU first and the evaluation of MM

 ```bash
 VOCAB_FILE_PATH=/tmp/models/gemma_nv/checkpoints/tmp_vocab.model
-python3 ../run.py --engine_dir ${ENGINE_PATH} \
+python3 ../../../run.py --engine_dir ${ENGINE_PATH} \
     --max_output_len 30 \
     --vocab_file ${VOCAB_FILE_PATH}

@@ -102,7 +102,7 @@ Output [Text 0 Beam 0]: "chef in the renowned kitchens of Lyon. After honing his
 * summarize.py

 ```bash
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \
     --max_ite 5 \

@@ -132,7 +132,7 @@ mv data/data data/mmlu
 Evaluate on MMLU dataset.

 ```bash
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH}
@@ -156,7 +156,7 @@ UNIFIED_CKPT_PATH=/tmp/ckpt/hf/gemma/2b/1-gpu/
 ENGINE_PATH=/tmp/engines/gemma/2B/bf16/1-gpu/
 VOCAB_FILE_PATH=gemma-2b/

-python3 ./examples/gemma/convert_checkpoint.py \
+python3 ./convert_checkpoint.py \
     --ckpt-type hf \
     --model-dir ${CKPT_PATH} \
     --dtype bfloat16 \

@@ -170,7 +170,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --tokenizer_dir ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -218,7 +218,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -266,7 +266,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -311,7 +311,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -352,7 +352,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -382,7 +382,7 @@ UNIFIED_CKPT_PATH=/tmp/checkpoints/tmp_7b_it_tensorrt_llm/bf16/tp1/
 ENGINE_PATH=/tmp/gemma/7B/bf16/1-gpu/
 VOCAB_FILE_PATH=gemma-7b-pytorch/tokenizer.model

-python3 ./examples/gemma/convert_checkpoint.py \
+python3 ./convert_checkpoint.py \
     --ckpt-type torch \
     --model-dir ${CKPT_PATH} \
     --dtype bfloat16 \
@@ -396,13 +396,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \
     --max_ite 5

-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH}

@@ -439,7 +439,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -479,7 +479,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -529,7 +529,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -569,7 +569,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \

@@ -615,13 +615,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 8 \
     --max_ite 5

-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH}

@@ -643,7 +643,7 @@ UNIFIED_CKPT_PATH=/tmp/checkpoints/tmp_1b_it_tensorrt_llm/bf16/tp1/
 ENGINE_PATH=/tmp/gemma3/1b/bf16/1-gpu/
 VOCAB_FILE_PATH=gemma-3-1b-it/tokenizer.model

-python3 ./examples/gemma/convert_checkpoint.py \
+python3 ./convert_checkpoint.py \
     --ckpt-type hf \
     --model-dir ${CKPT_PATH} \
     --dtype bfloat16 \
@@ -657,7 +657,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_seq_len 3100 \
     --output_dir ${ENGINE_PATH}

-python3 ./examples/summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_PATH} \
     --engine_dir ${ENGINE_PATH} \
     --batch_size 1 \

@@ -685,8 +685,8 @@ Modelopt toolkit also provides quantization solutions. To enable it, have the la

 #### Quantize Checkpoints

-```
-python ../quantization/quantize.py --model_dir ${HF_GEMMA_PATH} \
+```bash
+python ../../../quantization/quantize.py --model_dir ${HF_GEMMA_PATH} \
     --dtype float16 \
     --qformat ${QUANT_TYPE} \
     --output_dir ${UNIFIED_CKPT_PATH} \

@@ -697,7 +697,7 @@ HF_GEMMA_PATH can either be HF model card name or the downloaded model path. QUA
 #### Build Engines

 For fp8, build engines with:
-```
+```bash
 trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --max_batch_size 8 \
     --max_input_len 3000 \

@@ -707,7 +707,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \

 For int4_awq and int8_sq, build engines with:

-```
+```bash
 trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
     --gemm_plugin auto \
     --max_batch_size 8 \
@@ -1,5 +1,5 @@
 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--c ../constraints.txt
+-c ../../../constraints.txt
 # WAR the new posting of "nvidia-cudnn-cu12~=9.0".
 # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
 nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
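The per-model `requirements.txt` files pin against the shared constraints file with a relative `-c` entry, which after this move points three levels up. A minimal install sketch under the new layout (the Gemma directory is assumed from the hunks above; pip resolves the `-c` reference relative to the requirements file itself):

```bash
# Install the Gemma example requirements; the "-c ../../../constraints.txt" entry
# inside the file resolves to examples/constraints.txt from examples/models/core/gemma/.
pip install -r examples/models/core/gemma/requirements.txt
```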
@@ -26,15 +26,15 @@ This document explains how to build the [glm-4-9b](https://huggingface.co/THUDM/

 ## Overview

-The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../tensorrt_llm/models/chatglm/model.py).
+The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../../../tensorrt_llm/models/chatglm/model.py).
 The TensorRT-LLM ChatGLM example code is located in [`examples/glm-4-9b`](./). There is one main file:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.

-In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:

-* [`../run.py`](../run.py) to run the inference on an input text;
-* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+* [`run.py`](../../../run.py) to run the inference on an input text;
+* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.

 ## Support Matrix

@@ -154,7 +154,7 @@ If the engines are run successfully, you will see output like (glm-4-9b as the e

 ```bash
 # Run the default engine of GLM-4-9B on single GPU, other model name is available if built.
-python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
     --max_output_len 50 \
     --tokenizer_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/fp16/1-gpu

@@ -165,7 +165,7 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"
 ```bash
 # Run the Tensor Parallel 2 engine of glm_4_9b on two GPU, other model name is available if built.
 mpirun -n 2 \
-    python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+    python ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
         --max_output_len 50 \
         --tokenizer_dir glm_4_9b \
         --engine_dir trt_engines/glm_4_9b/fp16/2-gpu

@@ -186,7 +186,7 @@ Output [Text 0 Beam 0]: "There is no new information provided in the official do

 ```bash
 # Run the summarization of glm_4_9b task, other model name is available if built.
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --hf_model_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/fp16/1-gpu
 ```

@@ -208,7 +208,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/int8_wo/1-gpu \
     --output_dir trt_engines/glm_4_9b/int8_wo/1-gpu

 # Run inference.
-python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
     --max_output_len 50 \
     --tokenizer_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/int8_wo/1-gpu
@@ -232,7 +232,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/sq/1-gpu \
     --output_dir trt_engines/glm_4_9b/sq/1-gpu

 # Run inference.
-python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
     --max_output_len 50 \
     --tokenizer_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/sq/1-gpu

@@ -240,11 +240,11 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"

 ### Activation-aware Weight Quantization (AWQ)

-The [`../quantization/quantize.py`](../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
+The [`quantize.py`](../../../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.

 ```bash
 # glm_4_9b: single gpu, int4 awq quantization
-python ../quantization/quantize.py --model_dir glm_4_9b \
+python ../../../quantization/quantize.py --model_dir glm_4_9b \
     --dtype float16 \
     --qformat int4_awq \
     --output_dir trt_ckpt/glm_4_9b/int4_awq/1-gpu

@@ -255,7 +255,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/int4_awq/1-gpu \
     --output_dir trt_engines/glm_4_9b/int4_awq/1-gpu

 # Run inference.
-python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
     --max_output_len 50 \
     --tokenizer_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/int4_awq/1-gpu

@@ -263,11 +263,11 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"

 ### FP8 Quantization

-The [`../quantization/quantize.py`](../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
+The [`quantize.py`](../../../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.

 ```bash
 # glm_4_9b: single gpu, fp8 quantization
-python ../quantization/quantize.py --model_dir glm_4_9b \
+python ../../../quantization/quantize.py --model_dir glm_4_9b \
     --dtype float16 \
     --qformat fp8 \
     --kv_cache_dtype fp8 \

@@ -279,12 +279,8 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/fp8/1-gpu \
     --output_dir trt_engines/glm_4_9b/fp8/1-gpu

 # Run inference.
-python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
+python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
     --max_output_len 50 \
     --tokenizer_dir glm_4_9b \
     --engine_dir trt_engines/glm_4_9b/fp8/1-gpu
 ```

 ## Benchmark

 * The TensorRT-LLM ChatGLM benchmark is located in [benchmarks/](../../benchmarks/README.md)

@@ -1,4 +1,4 @@
--c ../constraints.txt
+-c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate
@@ -37,14 +37,14 @@ This document explains how to build the [GPT](https://huggingface.co/gpt2) model

 ## Overview

-The TensorRT-LLM GPT implementation can be found in [`tensorrt_llm/models/gpt/model.py`](../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM GPT example code is located in [`examples/gpt`](./). There is one main file:
+The TensorRT-LLM GPT implementation can be found in [`tensorrt_llm/models/gpt/model.py`](../../../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM GPT example code is located in [`examples/models/core/gpt`](./). There is one main file:

 * [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.

-In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:

-* [`../run.py`](../run.py) to run the inference on an input text;
-* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+* [`run.py`](../../../run.py) to run the inference on an input text;
+* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.

 ## Support Matrix
 * FP16

@@ -161,7 +161,7 @@ You can build engine(s) using random weights, which is useful for benchmarking.

 ```bash
 # Generate an 8-GPU GPT-175B float16 checkpoint config file.
-python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \
+python3 ../../../generate_checkpoint_config.py --architecture GPTForCausalLM \
     --vocab_size 51200 \
     --hidden_size 12288 \
     --num_hidden_layers 96 \

@@ -172,7 +172,7 @@ python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \

 # Generate a 16-GPU GPT-530B float16 checkpoint config file.
-python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \
+python3 ../../../generate_checkpoint_config.py --architecture GPTForCausalLM \
     --vocab_size 51200 \
     --hidden_size 20480 \
     --num_hidden_layers 105 \

@@ -207,10 +207,10 @@ trtllm-build --model_config gpt_530b/trt_ckpt/fp16/16-gpu/config.json \
 ### 5. Run inference
 #### Single node, single GPU

-The [`../run.py`](../run.py) script can be used to run inference with the built engine(s).
+The [`run.py`](../../../run.py) script can be used to run inference with the built engine(s).

 ```bash
-python3 ../run.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
+python3 ../../../run.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
     --tokenizer_dir gpt2 \
     --max_output_len 8
 ```
@@ -222,13 +222,13 @@ Input [Text 0]: "Born in north-east France, Soyer trained as a"
 Output [Text 0 Beam 0]: " chef before moving to London in the early"
 ```

-The [`../summarize.py`](../summarize.py) script can run the built engines to summarize the articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+The [`summarize.py`](../../../summarize.py) script can run the built engines to summarize the articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
 For each summary, the script can compute the
 [ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)) scores and use the `ROUGE-1` score to validate the implementation.
 By passing `--test_trt_llm` flag, the script will evaluate TensorRT-LLM engines. You may also pass `--test_hf` flag to evaluate the HF model.

 ```bash
-python3 ../summarize.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
+python3 ../../../summarize.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
     --hf_model_dir gpt2 \
     --test_trt_llm \
     --test_hf

@@ -258,13 +258,13 @@ To run engines using multiple GPUs on a single node, you can use `mpirun` as:

 ```bash
 mpirun -np 2 \
-    python3 ../run.py --engine_dir gpt2/trt_engines/fp16/2-gpu \
+    python3 ../../../run.py --engine_dir gpt2/trt_engines/fp16/2-gpu \
         --tokenizer_dir gpt2 \
         --max_output_len 8

 # Note that GPT-175B is built with random weights, so the output will also be random
 mpirun -np 8 \
-    python3 ../run.py --engine_dir gpt_175b/trt_engines/fp16/8-gpu \
+    python3 ../../../run.py --engine_dir gpt_175b/trt_engines/fp16/8-gpu \
         --max_output_len 8
 ```

@@ -293,7 +293,7 @@ srun --mpi=pmix \
     --container-workdir <path> \
     --output logs/tensorrt_llm_%t.out \
     --error logs/tensorrt_llm_%t.error \
-    python3 -u ../run.py --engine_dir <engine_dir> --max_output_len 8
+    python3 -u ../../../run.py --engine_dir <engine_dir> --max_output_len 8
 ```

 Then, submit the job using:

@@ -482,11 +482,11 @@ trtllm-build --checkpoint_dir gpt2/trt_ckpt/int4-wo/1-gpu \

 ### FP8 Quantization

-[`../quantization/quantize.py`](../quantization/quantize.py) can do FP8 quantization and/or FP8 kv cache quantization, and export TensorRT-LLM checkpoint.
+[`quantize.py`](../../../quantization/quantize.py) can do FP8 quantization and/or FP8 kv cache quantization, and export TensorRT-LLM checkpoint.

 ```bash
 # FP8 quantization with FP8 kv cache
-python3 ../quantization/quantize.py --model_dir gpt2 \
+python3 ../../../quantization/quantize.py --model_dir gpt2 \
     --dtype float16 \
     --qformat fp8 \
     --kv_cache_dtype fp8 \
@@ -558,7 +558,7 @@ trtllm-build --checkpoint_dir granite/trt_ckpt/fp16/4-gpu \

 # Run inference
 mpirun -np 4 \
-    python3 ../run.py --engine_dir granite/trt_engines/fp16/4-gpu \
+    python3 ../../../run.py --engine_dir granite/trt_engines/fp16/4-gpu \
         --tokenizer_dir granite \
         --input_text "def print_hello_world():" \
         --max_output_len 20

@@ -585,7 +585,7 @@ trtllm-build --checkpoint_dir santacoder/trt_ckpt/fp16/4-gpu \

 # Run inference
 mpirun -np 4 \
-    python3 ../run.py --engine_dir santacoder/trt_engines/fp16/4-gpu \
+    python3 ../../../run.py --engine_dir santacoder/trt_engines/fp16/4-gpu \
        --tokenizer_dir santacoder \
        --input_text "def print_hello_world():" \
        --max_output_len 20

@@ -613,7 +613,7 @@ trtllm-build --checkpoint_dir starcoder/trt_ckpt/fp16/4-gpu \

 # Run inference
 mpirun -np 4 \
-    python3 ../run.py --engine_dir starcoder/trt_engines/fp16/4-gpu \
+    python3 ../../../run.py --engine_dir starcoder/trt_engines/fp16/4-gpu \
        --tokenizer_dir starcoder \
        --input_text "def print_hello_world():" \
        --max_output_len 20

@@ -638,7 +638,7 @@ git-lfs clone https://huggingface.co/KaQyn/peft-lora-starcoder2-15b-unity-copilo
 * Quantize the StarCoder2 model to fp8 from HF
 ```bash
 BASE_STARCODER2_MODEL=./starcoder2-15b
-python ../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
+python ../../../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
     --dtype float16 \
     --qformat fp8 \
     --kv_cache_dtype fp8 \

@@ -647,14 +647,14 @@ python ../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
 ```

 * Build engine and run inference.
-```
+```bash
 trtllm-build --checkpoint_dir starcoder2-15b/trt_ckpt/fp8/1-gpu \
     --output_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
     --gemm_plugin auto \
     --lora_plugin auto \
     --lora_dir ./peft-lora-starcoder2-15b-unity-copilot

-python ../run.py --engine_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
+python ../../../run.py --engine_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
     --max_output_len 20 \
     --tokenizer_dir ${BASE_STARCODER2_MODEL} \
     --input_text "def print_hello_world():" \
@@ -685,7 +685,7 @@ trtllm-build --checkpoint_dir gpt-next-2B/trt_ckpt/bf16/1-gpu \
     --output_dir gpt-next-2B/trt_engines/bf16/1-gpu

 # Run inference
-python3 ../run.py --engine_dir gpt-next-2B/trt_engines/bf16/1-gpu \
+python3 ../../../run.py --engine_dir gpt-next-2B/trt_engines/bf16/1-gpu \
     --vocab_file gpt-next-2B/trt_ckpt/bf16/1-gpu/tokenizer.model \
     --no_add_special_tokens \
     --max_output_len 8

@@ -718,7 +718,7 @@ It'll give you a summary of the different tasks in the table, that you can speci

 Finally, you can run inference on pre-defined tokens:
 ```bash
-python3 ../run.py --engine_dir gpt-next-8B/trt_engines/fp16/1-gpu \
+python3 ../../../run.py --engine_dir gpt-next-8B/trt_engines/fp16/1-gpu \
     --vocab_file gpt-next-8B/trt_ckpt/fp16/1-gpu/tokenizer.model \
     --no_add_special_tokens \
     --prompt_table_path email_composition.npy \

@@ -752,7 +752,7 @@ trtllm-build --checkpoint_dir gpt-next-2B/trt_ckpt/fp16/1-gpu \

 # Run inference directly from NeMo LoRA checkpoint
 # --lora_task_ids correspond to the index of the models given with --lora_dir. -1 means no LoRA
-python3 ../run.py --engine_dir gpt-next-2B/trt_engines/fp16/1-gpu \
+python3 ../../../run.py --engine_dir gpt-next-2B/trt_engines/fp16/1-gpu \
     --vocab_file gpt-next-2B/trt_ckpt/fp16/1-gpu/tokenizer.model \
     --no_add_special_tokens \
     --max_output_len 20 \

@@ -1,4 +1,4 @@
--c ../constraints.txt
+-c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
 datasets==3.1.0
 evaluate

@@ -2,7 +2,7 @@

 This document shows how to build and run a [Granite 3.0](https://huggingface.co/collections/ibm-granite/granite-30-language-models-66fdb59bbb54785c3512114f) model in TensorRT-LLM.

-The TensorRT-LLM Granite implementation is based on the LLaMA model, with Mixture of Experts (MoE) enabled. The implementation can be found in [`llama/model.py`](../../tensorrt_llm/models/llama/model.py). See the LLaMA example [`examples/llama`](../llama) for details.
+The TensorRT-LLM Granite implementation is based on the LLaMA model, with Mixture of Experts (MoE) enabled. The implementation can be found in [`llama/model.py`](../../../../tensorrt_llm/models/llama/model.py). See the LLaMA example [`examples/models/core/llama`](../llama) for details.

 - [Granite 3.0](#Granite)
 - [Download model checkpoints](#download-model-checkpoints)
@@ -46,12 +46,12 @@ python3 ../llama/convert_checkpoint.py --model_dir tmp/hf_checkpoints/${HF_MODEL
 ### FP8 PTQ
 Notes:
 - Currently quantize.py does not support Expert Parallelism (EP) mode yet. User should use `../llama/convert_checkpoint.py` and specify `--moe_ep_size 1` instead, if needed.
-- TensorRT-LLM uses static quantization methods, which is expected to be faster at runtime as compared to dynamic quantization methods. This comes at a cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to `../quantization/README.md` for explanation.
+- TensorRT-LLM uses static quantization methods, which is expected to be faster at runtime as compared to dynamic quantization methods. This comes at a cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to `../../../quantization/README.md` for explanation.

 ```bash
 PREC_QUANT="fp8"
 ENGINE="${HF_MODEL}_${PREC_QUANT}_tp${TP}"
-python ../quantization/quantize.py --model_dir tmp/hf_checkpoints/${HF_MODEL} \
+python ../../../quantization/quantize.py --model_dir tmp/hf_checkpoints/${HF_MODEL} \
     --dtype ${PREC_RAW} \
     --qformat ${PREC_QUANT} \
     --kv_cache_dtype ${PREC_QUANT} \

@@ -74,10 +74,10 @@ trtllm-build --checkpoint_dir ./tmp/tllm_checkpoints/${ENGINE} \
 ```

 ## Run Engine
-Test your engine with the [run.py](../run.py) script:
+Test your engine with the [run.py](../../../run.py) script:

 ```bash
-mpirun -n ${TP} --allow-run-as-root python ../run.py --engine_dir ./tmp/trt_engines/${ENGINE} --tokenizer_dir tmp/hf_checkpoints/${HF_MODEL} --max_output_len 20 --input_text "The future of AI is"
+mpirun -n ${TP} --allow-run-as-root python ../../../run.py --engine_dir ./tmp/trt_engines/${ENGINE} --tokenizer_dir tmp/hf_checkpoints/${HF_MODEL} --max_output_len 20 --input_text "The future of AI is"
 ```

-For more usage examples see [`examples/llama/README.md`](../llama/README.md)
+For more usage examples see [`examples/models/core/llama/README.md`](../llama/README.md)
@@ -5,16 +5,16 @@ This document shows how to build and run InternLM2 7B / 20B models in TensorRT-L
 ## Overview

 The TensorRT-LLM InternLM2 implementation is based on the LLaMA model. The implementation can
-be found in [model.py](../../tensorrt_llm/models/llama/model.py).
+be found in [model.py](../../../../tensorrt_llm/models/llama/model.py).
 The TensorRT-LLM InternLM2 example code lies in [`examples/internlm2`](./):

 * [`convert_checkpoint.py`](./convert_checkpoint.py) converts the Huggingface Model of InternLM2 into TensorRT-LLM checkpoint.

-In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
+In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:

-* [`../run.py`](../run.py) to run the inference on an input text;
-* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
+* [`run.py`](../../../run.py) to run the inference on an input text;
+* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.

 ## Support Matrix
 * FP16 / BF16

@@ -23,7 +23,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f

 ## Usage

-The TensorRT-LLM InternLM2 example code locates at [examples/internlm](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
+The TensorRT-LLM InternLM2 example code locates at [examples/models/core/internlm2](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.

 ### Build TensorRT engine(s)

@@ -47,7 +47,7 @@ Here're some examples:
 # Build a single-GPU float16 engine from HF weights.
 # gpt_attention_plugin is necessary in InternLM2.
 # Try use_gemm_plugin to prevent accuracy issue.
-cd examples/internlm2
+cd examples/models/core/internlm2

 # Convert the InternLM2 7B model using a single GPU and FP16.
 python convert_checkpoint.py --model_dir ./internlm2-chat-7b/ \

@@ -100,7 +100,7 @@ trtllm-build --checkpoint_dir ./internlm2-chat-7b/trt_engines/bf16/2-gpu/ \
 Examples:

 ```bash
-cd examples/internlm2
+cd examples/models/core/internlm2

 # For 7B models
 python convert_checkpoint.py --model_dir ./internlm2-chat-7b \

@@ -117,7 +117,7 @@ trtllm-build --checkpoint_dir ./internlm2-chat-7b/w8a16 \

 ```bash
-cd examples/internlm2
+cd examples/models/core/internlm2

 # For 20B models
 python convert_checkpoint.py --model_dir ./internlm2-chat-20b \
@@ -138,33 +138,33 @@ To run a TensorRT-LLM InternLM2 model using the engines generated by `trtllm-bui

 ```bash
 # InternLM2 7B with fp16
-python ../run.py --max_output_len=120 \
+python ../../../run.py --max_output_len=120 \
     --input_text 'Tell me about yourself.' \
     --tokenizer_dir ./internlm2-chat-7b/ \
     --engine_dir=./internlm2-chat-7b/trt_engines/fp16/1-gpu/

 # InternLM2 7B with bf16
-python ../run.py --max_output_len=120 \
+python ../../../run.py --max_output_len=120 \
     --input_text 'Tell me about yourself.' \
     --tokenizer_dir ./internlm2-chat-7b/ \
     --engine_dir=./internlm2-chat-7b/trt_engines/bf16/1-gpu/

 # InternLM2 7B with int8 weight only quantization
-python ../run.py --max_output_len=120 \
+python ../../../run.py --max_output_len=120 \
     --input_text 'Tell me about yourself.' \
     --tokenizer_dir ./internlm2-chat-7b/ \
     --engine_dir=./internlm2-chat-7b/trt_engines/weight_only/1-gpu/

 # InternLM2 7B with fp16 and tensor parallelism
 mpirun -n 2 --allow-run-as-root \
-    python ../run.py --max_output_len=120 \
+    python ../../../run.py --max_output_len=120 \
         --input_text 'Tell me about yourself.' \
         --tokenizer_dir ./internlm2-chat-7b/ \
         --engine_dir=./internlm2-chat-7b/trt_engines/fp16/2-gpu/

 # InternLM2 20B with fp16 and tensor parallelism and pipeline parallelism
 mpirun -n 4 --allow-run-as-root \
-    python ../run.py --max_output_len=120 \
+    python ../../../run.py --max_output_len=120 \
         --input_text 'Tell me about yourself.' \
         --tokenizer_dir ./internlm2-chat-7b/ \
         --engine_dir=./internlm2-chat-7b/trt_engines/bf16/4-gpu/

@@ -174,27 +174,27 @@ mpirun -n 4 --allow-run-as-root \

 ```bash
 # Run summarization using the InternLM2 7B model in FP16.
-python ../summarize.py --test_trt_llm --test_hf \
+python ../../../summarize.py --test_trt_llm --test_hf \
     --hf_model_dir ./internlm2-chat-7b/ \
     --data_type fp16 \
     --engine_dir ./engine_outputs

 # Run summarization using the InternLM2 7B model quantized to w8a16.
-python ../summarize.py --test_trt_llm --test_hf \
+python ../../../summarize.py --test_trt_llm --test_hf \
     --hf_model_dir ./internlm2-chat-7b/ \
     --data_type fp16 \
     --engine_dir ./engine_outputs

 # Run summarization using the InternLM2 7B model in FP16 using two GPUs.
 mpirun -n 2 --allow-run-as-root \
-    python ../summarize.py --test_trt_llm --test_hf \
+    python ../../../summarize.py --test_trt_llm --test_hf \
        --hf_model_dir ./internlm2-chat-7b/ \
        --data_type fp16 \
        --engine_dir ./internlm2-chat-7b/trt_engines/fp16/2-gpu/

 # Run summarization using the InternLM2 20B model in BF16 using 4 GPUs.
 mpirun -n 4 --allow-run-as-root \
-    python ../summarize.py --test_trt_llm --test_hf \
+    python ../../../summarize.py --test_trt_llm --test_hf \
        --hf_model_dir ./internlm2-chat-20b/ \
        --data_type bf16 \
        --engine_dir ./internlm2-chat-20b/trt_engines/bf16/4-gpu/
@ -40,14 +40,14 @@ This document shows how to build and run a LLaMA model in TensorRT-LLM on both s
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama/model.py](../../tensorrt_llm/models/llama/model.py). The TensorRT-LLM LLaMA example code is located in [`examples/llama`](./). There is one main file:
|
||||
The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama/model.py](../../../../tensorrt_llm/models/llama/model.py). The TensorRT-LLM LLaMA example code is located in [`examples/models/core/llama`](./). There is one main file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the LLaMA model into tensorrt-llm checkpoint format.
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
## Support Matrix
|
||||
* BF16/FP16
|
||||
@ -63,7 +63,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
|
||||
|
||||
## Usage
|
||||
|
||||
The TensorRT-LLM LLaMA example code is located at [examples/llama](./). It takes HF weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
|
||||
The TensorRT-LLM LLaMA example code is located at [examples/models/core/llama](./). It takes HF weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
|
||||
|
||||
### Build TensorRT engine(s)
|
||||
|
||||
@ -316,7 +316,7 @@ awk '{printf "%s\\n", $0} END {printf "\\nSummarize this story:"}' pg64317.txt >
|
||||
# Note that `--max_input_length <n>` is a convenience option to limit the input length of the data.
|
||||
# It should be set to the maximum context length the model supports. Here the limit is set to 32K.
|
||||
mpirun -n 8 --allow-run-as-root \
|
||||
python ../run.py \
|
||||
python ../../../run.py \
|
||||
--max_output_len 128 \
|
||||
--max_input_length 32768 \
|
||||
--input_file pg64317_sanitized.txt \
|
||||
@ -355,7 +355,7 @@ git-lfs clone https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-104
|
||||
To evaluate the PPL of very long contexts, we need to enable `use_paged_context_fmha` and set `max_num_tokens` to enable chunked context inference, which reduces the activation memory requirement. We also need to enable `gather_context_logits` to return the logits used to compute the PPL.
|
||||
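A minimal sketch of the corresponding build step, assuming the checkpoint produced by the conversion command below; the engine path and the `max_num_tokens` budget are illustrative, not prescriptive:

```bash
# Sketch: enable chunked context inference (paged context FMHA plus a max_num_tokens budget)
# and gather context logits so that PPL can be computed. Paths and values are illustrative.
trtllm-build --checkpoint_dir /tmp/llama-3-8B-1048k/trt_ckpts \
             --output_dir /tmp/llama-3-8B-1048k/trt_engines \
             --gemm_plugin auto \
             --use_paged_context_fmha enable \
             --max_num_tokens 4096 \
             --gather_context_logits
```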
|
||||
```bash
|
||||
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
|
||||
--dtype float16
|
||||
|
||||
@ -420,7 +420,7 @@ Prepare input data and run evaluation.
|
||||
```bash
|
||||
python examples/infinitebench/construct_synthetic_dataset.py --test_case build_kv_retrieval --test_level 0
|
||||
|
||||
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
|
||||
--dtype float16 \
|
||||
--tp_size 1
|
||||
@ -470,7 +470,7 @@ python examples/infinitebench/construct_synthetic_dataset.py --test_case build_p
|
||||
```bash
|
||||
git-lfs clone https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k/
|
||||
|
||||
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
|
||||
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
|
||||
--dtype float16 \
|
||||
--tp_size 4
|
||||
@ -500,7 +500,7 @@ For the 70B model, at least 8 A100 80GB GPUs are required.
|
||||
```bash
|
||||
git-lfs clone https://huggingface.co/gradientai/Llama-3-70B-Instruct-Gradient-1048k/
|
||||
|
||||
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-70B-Instruct-Gradient-1048k/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-70B-Instruct-Gradient-1048k/ \
|
||||
--output_dir /tmp/llama-3-70B-1048k/trt_ckpts \
|
||||
--dtype float16 \
|
||||
--tp_size 8
|
||||
@ -570,10 +570,10 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_int8_kv_wq \
|
||||
--gemm_plugin auto
|
||||
```
|
||||
|
||||
Test with `../summarize.py`:
|
||||
Test with `summarize.py`:
|
||||
|
||||
```bash
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./llama-models/llama-7b-hf \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_weight_only/1-gpu \
|
||||
@ -585,7 +585,7 @@ python ../summarize.py --test_trt_llm \
|
||||
In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization), as in the following command.
|
||||
|
||||
```bash
|
||||
python ../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
|
||||
python ../../../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
|
||||
--output_dir ./tllm_checkpoint_1gpu_awq_int8_kv_cache \
|
||||
--dtype float16 \
|
||||
--qformat int4_awq \
|
||||
@ -598,10 +598,10 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_awq_int8_kv_cache \
|
||||
--gemm_plugin auto \
|
||||
```
|
||||
|
||||
Test with `../summarize.py`:
|
||||
Test with `summarize.py`:
|
||||
|
||||
```bash
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir /tmp/llama-7b-hf \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu \
|
||||
@ -652,7 +652,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
|
||||
|
||||
```bash
|
||||
# Quantize HF LLaMA 70B into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir ./tmp/llama/70B \
|
||||
python ../../../quantization/quantize.py --model_dir ./tmp/llama/70B \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -676,7 +676,7 @@ Experimental: use FP8 GEMV to optimize performance in FP8 small-batch-size cases
|
||||
|
||||
```bash
|
||||
# Quantize HF LLaMA 7B into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
|
||||
python ../../../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -711,7 +711,7 @@ AWQ/GPTQ examples below involves 2 steps:
|
||||
|
||||
```bash
|
||||
# Quantize HF LLaMA 7B checkpoint into INT4 AWQ format
|
||||
python ../quantization/quantize.py --model_dir ./tmp/llama-7b-hf \
|
||||
python ../../../quantization/quantize.py --model_dir ./tmp/llama-7b-hf \
|
||||
--dtype float16 \
|
||||
--qformat int4_awq \
|
||||
--awq_block_size 128 \
|
||||
@ -868,12 +868,12 @@ To run a TensorRT-LLM LLaMA model using the engines generated by `trtllm-build`
|
||||
|
||||
```bash
|
||||
# With fp16 inference
|
||||
python3 ../run.py --max_output_len=50 \
|
||||
python3 ../../../run.py --max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/llama/7B/ \
|
||||
--engine_dir=./tmp/llama/7B/trt_engines/fp16/1-gpu/
|
||||
|
||||
# With bf16 inference
|
||||
python3 ../run.py --max_output_len=50 \
|
||||
python3 ../../../run.py --max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/llama/7B/ \
|
||||
--engine_dir=./tmp/llama/7B/trt_engines/bf16/1-gpu/
|
||||
```
|
||||
@ -910,7 +910,7 @@ To run the LLaMA 70B model on 2 nodes via Slurm, you need to prepare a Slurm scr
|
||||
srun --container-image=<docker-image> \
|
||||
--mpi=pmix \
|
||||
... \ # more srun options here
|
||||
python3 ../run.py --max_output_len=50 \
|
||||
python3 ../../../run.py --max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/llama/70B/hf/ \
|
||||
--engine_dir=./tmp/llama/70B/trt_engines/fp16/16-gpu/
|
||||
```
|
||||
@ -923,27 +923,27 @@ Considering the Slurm or other cluster management systems may be highly customiz
|
||||
|
||||
```bash
|
||||
# Run summarization using the LLaMA 7B model in FP16.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/llama/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/7B/trt_engines/fp16/1-gpu/
|
||||
|
||||
# Run summarization using the LLaMA 7B model quantized to INT8.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/llama/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/7B/trt_engines/weight_only/1-gpu/
|
||||
|
||||
# Run summarization using the LLaMA 7B model in FP16 using two GPUs.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/llama/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/7B/trt_engines/fp16/2-gpu/
|
||||
|
||||
# Run summarization using the LLaMA 30B model in FP16 using two GPUs.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/llama/30B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/llama/30B/trt_engines/fp16/2-gpu/
|
||||
@ -965,7 +965,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_mistral \
|
||||
--max_input_len 32256
|
||||
|
||||
# Run Mistral 7B fp16 inference with sliding window/cache size 4096
|
||||
python ../run.py --max_output_len=50 \
|
||||
python ../../../run.py --max_output_len=50 \
|
||||
--tokenizer_dir ./mistral-7b-v0.1 \
|
||||
--engine_dir=./tmp/mistral/7B/trt_engines/fp16/1-gpu/ \
|
||||
--max_attention_window_size=4096
|
||||
@ -994,7 +994,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_mistral_nemo \
|
||||
--max_input_len 10240
|
||||
|
||||
# Run summarization using the Mistral Nemo model quantized to INT8.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./Mistral-Nemo-Instruct-2407 \
|
||||
--data_type bf16 \
|
||||
--engine_dir ./tmp/mistral_nemo/trt_engines/bf16/1-gpu//
|
||||
@ -1024,7 +1024,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
|
||||
|
||||
```bash
|
||||
# Quantize HF CodeLlama 7B into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir /tmp/CodeLlama-7b-Instruct-hf \
|
||||
python ../../../quantization/quantize.py --model_dir /tmp/CodeLlama-7b-Instruct-hf \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -1070,13 +1070,13 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_4gpu_codellama \
|
||||
|
||||
### Run
|
||||
Use the following command to run the 7b engine from above:
|
||||
```
|
||||
```bash
|
||||
python ../run.py --max_output_len=40 --tokenizer_dir . --engine_dir codellama_7b --input_text "In Bash, how do I list all text files?"
|
||||
```
|
||||
Use the following command to run the 34b engine with long input/output from above:
|
||||
```
|
||||
```bash
|
||||
mpirun -n 8 --allow-run-as-root \
|
||||
python ../run.py --max_output_len=160 --tokenizer_dir ./CodeLlama-34b-Instruct \
|
||||
python ../../../run.py --max_output_len=160 --tokenizer_dir ./CodeLlama-34b-Instruct \
|
||||
--engine_dir codellama_34b --input_text "In python, write a function for binary searching an element in an integer array."
|
||||
```
|
||||
|
||||
@ -1110,7 +1110,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_2gpu \
|
||||
Run inference. Remember to use the LoRA tokenizer, because the LoRA model has a larger vocabulary size.
|
||||
|
||||
```bash
|
||||
mpirun -n 2 python ../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
|
||||
mpirun -n 2 python ../../../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
|
||||
--max_output_len 50 \
|
||||
--tokenizer_dir "chinese-llama-2-lora-13b/" \
|
||||
--input_text "今天天气很好,我到公园的时候," \
|
||||
@ -1128,7 +1128,7 @@ different. Since the LoRA tokenizer, embedding and LM head are still used,
|
||||
the results will also be different from vanilla LLaMA and will degrade significantly compared with `--lora_task_uids 0`.
|
||||
|
||||
```bash
|
||||
mpirun -n 2 python ../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
|
||||
mpirun -n 2 python ../../../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
|
||||
--max_output_len 50 \
|
||||
--tokenizer_dir "chinese-llama-2-lora-13b/" \
|
||||
--input_text "今天天气很好,我到公园的时候," \
|
||||
@ -1178,7 +1178,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu \
|
||||
--max_lora_rank 8 \
|
||||
--lora_target_modules attn_q attn_k attn_v
|
||||
|
||||
python ../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
|
||||
python ../../../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
|
||||
--max_output_len 10 \
|
||||
--tokenizer_dir ${BASE_LLAMA_MODEL} \
|
||||
--input_text "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" \
|
||||
@ -1228,7 +1228,7 @@ git-lfs clone https://huggingface.co/davidkim205/komt-mistral-7b-v1-lora
|
||||
* Quantize the Mistral v0.1 model to fp8 from HF
|
||||
```bash
|
||||
BASE_MISTRAL_MODEL=komt-mistral-7b-v1/
|
||||
python ../quantization/quantize.py --model_dir ${BASE_MISTRAL_MODEL} \
|
||||
python ../../../quantization/quantize.py --model_dir ${BASE_MISTRAL_MODEL} \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -1247,7 +1247,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8 \
|
||||
--max_seq_len 33280 \
|
||||
--lora_dir ./komt-mistral-7b-v1-lora
|
||||
|
||||
python ../run.py --max_output_len=1024 \
|
||||
python ../../../run.py --max_output_len=1024 \
|
||||
--tokenizer_dir ./komt-mistral-7b-v1 \
|
||||
--engine_dir=/tmp/mistral_komt_lora/7B/trt_engines/fp8/1-gpu/ \
|
||||
--input_text "[INST]오늘은 날씨가 아주 좋다 내가 공원에 갔을 때 [/INST]" \
|
||||
@ -1274,7 +1274,7 @@ TensorRT-LLM can also support Quantized base model + FP16/BF16 LoRA. We can firs
|
||||
* Quantize the llama model to INT4-AWQ from HF
|
||||
```bash
|
||||
BASE_LLAMA_MODEL=llama-7b-hf/
|
||||
python ../quantization/quantize.py --model_dir ${BASE_LLAMA_MODEL} \
|
||||
python ../../../quantization/quantize.py --model_dir ${BASE_LLAMA_MODEL} \
|
||||
--output_dir ./tllm_checkpoint_1gpu_awq \
|
||||
--dtype float16 \
|
||||
--qformat int4_awq \
|
||||
@ -1298,7 +1298,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_awq \
|
||||
--max_lora_rank 8 \
|
||||
--lora_target_modules attn_q attn_k attn_v
|
||||
|
||||
python ../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/int4_AWQ/1-gpu/" \
|
||||
python ../../../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/int4_AWQ/1-gpu/" \
|
||||
--max_output_len 10 \
|
||||
--tokenizer_dir ${BASE_LLAMA_MODEL} \
|
||||
--input_text "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" \
|
||||
@ -1353,7 +1353,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_streamlingllm \
|
||||
|
||||
```bash
|
||||
# Run LLaMA 7B fp16 inference with sliding window/cache size 2048 and sink token length 4.
|
||||
python3 ../run.py --max_output_len=50 \
|
||||
python3 ../../../run.py --max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/llama/7B/ \
|
||||
--engine_dir=./tmp/llama/7B/trt_engines/fp16_StreamingLLM/1-gpu/ \
|
||||
--max_attention_window_size=2048 \
|
||||
@ -1377,7 +1377,7 @@ Note: For 405B HF model cloned before 09 Aug 2024, there are duplicated kv head
|
||||
|
||||
```bash
|
||||
# Run BF16 model by BF16
|
||||
python examples/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_model/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_model/ \
|
||||
--output_dir llama_3.1_405B_meta_model/trt_ckpts/tp8-pp2/ \
|
||||
--dtype bfloat16 \
|
||||
--tp_size 8 \
|
||||
@ -1386,7 +1386,7 @@ python examples/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_
|
||||
--workers 2
|
||||
|
||||
# Run BF16 model by FP8
|
||||
python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/ \
|
||||
--output_dir llama_3.1_405B_HF_model/trt_ckpts/tp8-pp1/ \
|
||||
--dtype bfloat16 \
|
||||
--use_fp8_rowwise \
|
||||
@ -1400,7 +1400,7 @@ python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/
|
||||
# Optionally enable --use_meta_fp8_rowwise_recipe to strictly follow the original Meta's LLaMA 3.1 recipe:
|
||||
# (1) Skip quantization for the first and last Transformer layers
|
||||
# (2) Skip quantization for the Attention layers
|
||||
python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_FP8_model/ \
|
||||
python examples/models/core/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_FP8_model/ \
|
||||
--output_dir llama_3.1_405B_HF_FP8_model/trt_ckpts/tp8-pp1/ \
|
||||
--dtype bfloat16 \
|
||||
--tp_size 8 \
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
transformers>=4.43.0
|
||||
datasets==3.1.0
|
||||
@ -13,14 +13,14 @@ This document shows how to build and run a [Mamba](https://github.com/state-spac
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Mamba implementation can be found in [`tensorrt_llm/models/mamba/model.py`](../../tensorrt_llm/models/mamba/model.py). The TensorRT-LLM Mamba example code is located in [`examples/mamba`](./). There is one main file:
|
||||
The TensorRT-LLM Mamba implementation can be found in [`tensorrt_llm/models/mamba/model.py`](../../../../tensorrt_llm/models/mamba/model.py). The TensorRT-LLM Mamba example code is located in [`examples/models/core/mamba`](./). There is one main file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
|
||||
## Support Matrix
|
||||
@ -182,35 +182,35 @@ The following section describes how to run a TensorRT-LLM Mamba model to summari
|
||||
|
||||
```bash
|
||||
# mamba-2.8b
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba-2.8b/ \
|
||||
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
|
||||
--data_type bf16 \
|
||||
--engine_dir ./mamba_model/mamba-2.8b/trt_engines/bf16/1-gpu/
|
||||
|
||||
# mamba-130m
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba-130m/ \
|
||||
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./mamba_model/mamba-130m/trt_engines/fp16/1-gpu/
|
||||
|
||||
# mamba2-2.7b
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba2-2.7b/ \
|
||||
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./mamba_model/mamba2-2.7b/trt_engines/fp16/1-gpu/
|
||||
|
||||
# mamba2-130m
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba2-130m/ \
|
||||
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./mamba_model/mamba2-130m/trt_engines/fp16/1-gpu/
|
||||
|
||||
# mamba-codestral-7B-v0.1
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
|
||||
--tokenizer_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
|
||||
--data_type fp16 \
|
||||
@ -218,7 +218,7 @@ python ../summarize.py --test_trt_llm \
|
||||
|
||||
# mamba-codestral-7B-v0.1 with 2-way tensor parallelism.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
|
||||
--tokenizer_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
|
||||
--data_type fp16 \
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
transformers>=4.39.0
|
||||
datasets==3.1.0
|
||||
@ -7,8 +7,8 @@ sufficient.
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Mixtral implementation is based on the LLaMA model, with Mixture of Experts enabled. The implementation can
|
||||
be found in [tensorrt_llm/models/llama/model.py](../../tensorrt_llm/models/llama/model.py).
|
||||
See the LLaMA example [`examples/llama`](../llama) for details.
|
||||
be found in [tensorrt_llm/models/llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
|
||||
See the LLaMA example [`examples/models/core/llama`](../llama) for details.
|
||||
|
||||
### Build TensorRT engine(s)
|
||||
|
||||
@ -74,13 +74,13 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_8gpu \
|
||||
--gemm_plugin float16
|
||||
```
|
||||
|
||||
Then, you can test your engine with the [run.py](../run.py) script:
|
||||
Then, you can test your engine with the [run.py](../../../run.py) script:
|
||||
|
||||
```bash
|
||||
mpirun -n 2 python3 ../run.py --engine_dir ./trt_engines/mixtral/tp2 --tokenizer_dir ./Mixtral-8x7B-v0.1 --max_output_len 8 --input_text "I love french quiche"
|
||||
mpirun -n 2 python3 ../../../run.py --engine_dir ./trt_engines/mixtral/tp2 --tokenizer_dir ./Mixtral-8x7B-v0.1 --max_output_len 8 --input_text "I love french quiche"
|
||||
```
|
||||
|
||||
For more examples see [`examples/llama/README.md`](../llama/README.md)
|
||||
For more examples see [`examples/models/core/llama/README.md`](../llama/README.md)
|
||||
|
||||
### Parallelism Modes
|
||||
|
||||
@ -129,7 +129,7 @@ of the different top-k values.
|
||||
- 2 (SPARSE_MIXER) corresponds to: `scales = sparsemixer(routing values)`
|
||||
|
||||
Mixtral uses `RENORM` mode, which is the default. To use a different mode, pass the `--moe_normalization_mode` flag.
|
||||
See [tensorrt_llm/layers/moe.py](../../tensorrt_llm/layers/moe.py#L56) for the available values.
|
||||
See [tensorrt_llm/layers/moe.py](../../../../tensorrt_llm/layers/moe.py#L56) for the available values.
|
||||
|
||||
|
||||
## Quantization
|
||||
@ -153,11 +153,11 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \
|
||||
|
||||
### FP8 Post-Training Quantization
|
||||
|
||||
Mixtral supports FP8 quantization, using Modelopt. See [`examples/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing Modelopt
|
||||
Mixtral supports FP8 quantization, using Modelopt. See [`examples/models/core/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing Modelopt
|
||||
|
||||
```bash
|
||||
# Quantize HF Mixtral into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
|
||||
python ../../../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -194,7 +194,7 @@ Mixtral supports NVFP4 quantization.
|
||||
|
||||
```bash
|
||||
# Quantize HF Mixtral into NVFP4 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
|
||||
python ../../../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
|
||||
--dtype float16 \
|
||||
--qformat nvfp4 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
transformers==4.38.2
|
||||
accelerate==0.25.0
|
||||
@ -62,7 +62,7 @@ Not all models supports end-to-end `cpp` mode, the checked ones below are suppor
|
||||
This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed when switching the LLM backbone.
|
||||
|
||||
1. Download Huggingface weights and convert the original checkpoint to TRT-LLM checkpoint format
|
||||
following the examples in `examples/opt/README.md` and `examples/enc_dec/README.md`.
|
||||
following the examples in `examples/models/contrib/opt/README.md` and `examples/models/core/enc_dec/README.md`.
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="blip2-opt-2.7b" # options: blip2-opt-6.7b, blip2-flan-t5-xl, blip2-flan-t5-xxl
|
||||
@ -71,7 +71,7 @@ This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed
|
||||
|
||||
For BLIP2-OPT family,
|
||||
```bash
|
||||
python ../opt/convert_checkpoint.py --model_type blip2 \
|
||||
python ../../contrib/opt/convert_checkpoint.py --model_type blip2 \
|
||||
--model_dir tmp/hf_models/${MODEL_NAME} \
|
||||
--output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
|
||||
--dtype float16
|
||||
@ -168,7 +168,7 @@ This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed
|
||||
|
||||
5. (Optional) INT8/INT4 weight-only quantization for OPT can be enabled using commands as follows (take `INT4` as an example, while `INT8` is the default precision for weight-only quantization):
|
||||
```bash
|
||||
python ../opt/convert_checkpoint.py \
|
||||
python ../../contrib/opt/convert_checkpoint.py \
|
||||
--model_dir tmp/hf_models/${MODEL_NAME} \
|
||||
--dtype float16 \
|
||||
--output_dir tmp/trt_models/${MODEL_NAME}/int4_weightonly/1-gpu \
|
||||
@ -216,7 +216,7 @@ Currently, CogVLM only support bfloat16 precision.
|
||||
CogVLM uses a ViT encoder as its vision encoder and a modified Llama as its decoder.
|
||||
|
||||
```bash
|
||||
python ../cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir tmp/trt_models/${MODEL_NAME} --dtype bfloat16 --use_prompt_tuning
|
||||
python ../../contrib/cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir tmp/trt_models/${MODEL_NAME} --dtype bfloat16 --use_prompt_tuning
|
||||
|
||||
trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME} \
|
||||
--output_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu/llm \
|
||||
@ -461,7 +461,7 @@ Firstly, please install transformers with 4.37.2
|
||||
|
||||
```bash
|
||||
# FP8 quantization
|
||||
python ../quantization/quantize.py \
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir tmp/hf_models/${MODEL_NAME} \
|
||||
--output_dir tmp/trt_models/${MODEL_NAME}/fp8/1-gpu \
|
||||
--dtype bfloat16 \
|
||||
@ -469,7 +469,7 @@ Firstly, please install transformers with 4.37.2
|
||||
--kv_cache_dtype fp8
|
||||
|
||||
# INT8 SmoothQuant quantization
|
||||
python ../quantization/quantize.py \
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir tmp/hf_models/${MODEL_NAME} \
|
||||
--output_dir tmp/trt_models/${MODEL_NAME}/int8/1-gpu \
|
||||
--dtype bfloat16 \
|
||||
@ -710,7 +710,7 @@ Firstly, please install transformers with 4.37.2
|
||||
--weight_only_precision int4
|
||||
|
||||
# INT4 AWQ
|
||||
python ../quantization/quantize.py \
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir tmp/hf_models/${MODEL_NAME} \
|
||||
--output_dir tmp/trt_models/${MODEL_NAME}/int4_awq/1-gpu \
|
||||
--dtype float16 \
|
||||
@ -737,7 +737,7 @@ For LLaMA-3.2 text model, please refer to the [examples/llama/README.md](../llam
|
||||
* build engine of vision encoder model
|
||||
|
||||
```bash
|
||||
python examples/multimodal/build_multimodal_engine.py --model_type mllama \
|
||||
python examples/models/core/multimodal/build_multimodal_engine.py --model_type mllama \
|
||||
--model_path Llama-3.2-11B-Vision/ \
|
||||
--output_dir /tmp/mllama/trt_engines/vision/
|
||||
```
|
||||
@ -745,7 +745,7 @@ python examples/multimodal/build_multimodal_engine.py --model_type mllama \
|
||||
* build engine of decoder model
|
||||
|
||||
```bash
|
||||
python examples/mllama/convert_checkpoint.py --model_dir Llama-3.2-11B-Vision/ \
|
||||
python examples/models/core/mllama/convert_checkpoint.py --model_dir Llama-3.2-11B-Vision/ \
|
||||
--output_dir /tmp/mllama/trt_ckpts \
|
||||
--dtype bfloat16
|
||||
|
||||
@ -765,7 +765,7 @@ Note that for instruct Vision model, please set the `max_encoder_input_len` as `
|
||||
* Run test on multimodal/run.py with C++ runtime (LLM part only)
|
||||
|
||||
```bash
|
||||
python3 examples/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
|
||||
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
|
||||
--hf_model_dir Llama-3.2-11B-Vision/ \
|
||||
--image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
|
||||
--input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one" \
|
||||
@ -774,7 +774,7 @@ python3 examples/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
|
||||
|
||||
`model_runner_cpp` is used by default. To switch to `model_runner`, add `--session python` to the command above, as in the sketch below.
|
||||
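For example, a sketch that reuses the run.py invocation above with only the runner switched (all other arguments unchanged):

```bash
# Sketch: same invocation as above, but selecting the Python runner via --session python.
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
    --hf_model_dir Llama-3.2-11B-Vision/ \
    --image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
    --input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one" \
    --session python
```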
|
||||
python3 examples/multimodal/eval.py \
|
||||
python3 examples/models/core/multimodal/eval.py \
|
||||
--engine_dir /tmp/mllama/trt_engines/ \
|
||||
--hf_model_dir Llama-3.2-11B-Vision/ \
|
||||
--test_trtllm \
|
||||
@ -810,14 +810,14 @@ trtllm-build --checkpoint_dir /tmp/llama-3.2-11B-Vision/fp8/ \
|
||||
|
||||
# copy the visual engine directory `/tmp/mllama/trt_engines/vision/` to the fp8 engine directory `/tmp/trt_engines/llama-3.2-11B-Vision/fp8/vision`
|
||||
|
||||
python3 examples/multimodal/run.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
|
||||
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
|
||||
--hf_model_dir Llama-3.2-11B-Vision/ \
|
||||
--image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
|
||||
--input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one" \
|
||||
--max_new_tokens 50 \
|
||||
--batch_size 2
|
||||
|
||||
python3 examples/multimodal/eval.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
|
||||
python3 examples/models/core/multimodal/eval.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
|
||||
--hf_model_dir Llama-3.2-11B-Vision/ \
|
||||
--test_trtllm \
|
||||
--accuracy_threshold 65 \
|
||||
@ -1049,7 +1049,7 @@ pip install -r requirements-qwen2vl.txt
|
||||
```bash
|
||||
pip install decord # used for loading video
|
||||
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path /path/to/nemotron/model.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -14,12 +14,12 @@ This document demonstrates how to build the Nemotron models using TensorRT-LLM a
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Nemotron implementation is based on the GPT model, which can be found in [`tensorrt_llm/models/gpt/model.py`](../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM Nemotron example is located in [`examples/nemotron`](./).
|
||||
The TensorRT-LLM Nemotron implementation is based on the GPT model, which can be found in [`tensorrt_llm/models/gpt/model.py`](../../../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM Nemotron example is located in [`examples/models/core/nemotron`](./).
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
## Support Matrix
|
||||
* FP16/BF16
|
||||
@ -61,7 +61,7 @@ git clone https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf
|
||||
```
|
||||
|
||||
### Build TensorRT engine(s)
|
||||
The [`examples/quantization/quantize.py`](../quantization/quantize.py) script can quantize the Nemotron models and export to TensorRT-LLM checkpoints. You may optionally skip the quantization step by specifying `--qformat full_prec` and thus export float16 or bfloat16 TensorRT-LLM checkpoints.
|
||||
The [`examples/quantization/quantize.py`](../../../quantization/quantize.py) script can quantize the Nemotron models and export to TensorRT-LLM checkpoints. You may optionally skip the quantization step by specifying `--qformat full_prec` and thus export float16 or bfloat16 TensorRT-LLM checkpoints.
|
||||
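As a sketch, skipping quantization only changes the `--qformat` value; the checkpoint path and the other arguments mirror the bfloat16 example below:

```bash
# Sketch: export an unquantized bfloat16 TensorRT-LLM checkpoint via --qformat full_prec.
python3 ../../../quantization/quantize.py \
        --nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
        --dtype bfloat16 \
        --batch_size 64 \
        --qformat full_prec \
        --output_dir nemotron-3-8b/trt_ckpt/bf16/1-gpu
```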
|
||||
The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpoints. The number of engine files is the same as the number of GPUs used to run inference. `trtllm-build` uses one GPU by default, but if you already have more GPUs available at build time, you may enable parallel builds to make the engine building process faster by adding the `--workers` argument (see the sketch below).
|
||||
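For instance, a sketch of a parallel two-rank build; the checkpoint and engine directories match the 2-way tensor parallelism example below, and plugin options are omitted here:

```bash
# Sketch: build both rank engines in parallel by adding --workers 2 to trtllm-build.
trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/tp2 \
             --output_dir nemotron-3-8b/trt_engines/bf16/tp2 \
             --workers 2
```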
|
||||
@ -69,7 +69,7 @@ Here are some examples:
|
||||
|
||||
```bash
|
||||
# single gpu, dtype bfloat16
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -84,7 +84,7 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/1-gpu \
|
||||
|
||||
```bash
|
||||
# 2-way tensor parallelism
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -102,7 +102,7 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/tp2 \
|
||||
```bash
|
||||
# 2-way tensor parallelism for both calibration and inference
|
||||
mpirun -np 2 \
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -124,7 +124,7 @@ Quantize the Nemotron models to FP8 by specifying `--qformat fp8` to `quantize.p
|
||||
|
||||
```bash
|
||||
# single gpu, fp8 quantization
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -142,7 +142,7 @@ Quantize the Nemotron models using INT4 AWQ by specifying `--qformat int4_awq` t
|
||||
|
||||
```bash
|
||||
# single gpu, int4 awq quantization
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
|
||||
--dtype bfloat16 \
|
||||
--batch_size 64 \
|
||||
@ -156,19 +156,19 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/int4_awq/1-gpu \
|
||||
|
||||
### Run Inference
|
||||
|
||||
The `../summarize.py` script can run the built engines to summarize the articles from the
|
||||
The `summarize.py` script can run the built engines to summarize the articles from the
|
||||
[cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
```bash
|
||||
# single gpu
|
||||
python3 ../summarize.py --test_trt_llm \
|
||||
python3 ../../../summarize.py --test_trt_llm \
|
||||
--no_add_special_tokens \
|
||||
--engine_dir nemotron-3-8b/trt_engines/bf16/1-gpu \
|
||||
--vocab_file nemotron-3-8b/trt_ckpt/bf16/1-gpu/tokenizer.model
|
||||
|
||||
# multiple gpus
|
||||
mpirun -np 2 \
|
||||
python3 ../summarize.py --test_trt_llm \
|
||||
python3 ../../../summarize.py --test_trt_llm \
|
||||
--no_add_special_tokens \
|
||||
--engine_dir nemotron-3-8b/trt_engines/bf16/tp2 \
|
||||
--vocab_file nemotron-3-8b/trt_ckpt/bf16/tp2/tokenizer.model
|
||||
@ -207,7 +207,7 @@ trtllm-build --checkpoint_dir minitron/trt_ckpt/bf16/1-gpu \
|
||||
--output_dir minitron/trt_engines/bf16/1-gpu
|
||||
|
||||
# Run inference
|
||||
python3 ../run.py --engine_dir minitron/trt_engines/bf16/1-gpu \
|
||||
python3 ../../../run.py --engine_dir minitron/trt_engines/bf16/1-gpu \
|
||||
--tokenizer_dir Minitron-4B-Base \
|
||||
--input_text "def print_hello_world():" \
|
||||
--max_output_len 20
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
nemo-toolkit[all]==2.0.0rc1
|
||||
megatron-core @ git+https://github.com/NVIDIA/Megatron-LM@core_r0.8.0
|
||||
@ -12,7 +12,7 @@ This document shows how to convert and build a model generated by Nemotron-NAS,
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Nemotron-NAS implementation can be found in [tensorrt_llm/models/nemotron_nas/model.py](../../tensorrt_llm/models/nemotron_nas/model.py). The TensorRT-LLM Nemotron-NAS example code is located in [`examples/nemotron_nas`](./). There is one main file:
|
||||
The TensorRT-LLM Nemotron-NAS implementation can be found in [tensorrt_llm/models/nemotron_nas/model.py](../../../../tensorrt_llm/models/nemotron_nas/model.py). The TensorRT-LLM Nemotron-NAS example code is located in [`examples/models/core/nemotron_nas`](./). There is one main file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the model into tensorrt-llm checkpoint format.
|
||||
|
||||
@ -93,7 +93,7 @@ In particular, the plugin-related options have two categories:
|
||||
export DATASET_DIR="~/datasets/nemotron-nas"
|
||||
python ./calibration_utils.py $DATASET_DIR # download and transform the recommended dataset.
|
||||
|
||||
python ../quantization/quantize.py \
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir $MODEL_DIR \
|
||||
--output_dir $TRT_CHECKPOINT_DIR \
|
||||
--dtype bfloat16 \
|
||||
@ -116,7 +116,7 @@ The conversion script supports additional models with variable GQA, such as [Dec
|
||||
## Runtime
|
||||
|
||||
After you build the engine, you can use the engine with any TensorRT-LLM entrypoint or API.
|
||||
For example, you can run inference with [examples/run.py](../run.py):
|
||||
For example, you can run inference with [examples/run.py](../../../run.py):
|
||||
|
||||
```bash
|
||||
export MODEL_DIR="~/models/huggingface/nemotron-nas"
|
||||
@ -14,14 +14,14 @@ For multimodal models (Phi-3-vision-128k-instruct and Phi-3.5-vision-instruct),
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file:
|
||||
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/models/core/phi`](./) with a single file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
## Support Matrix
|
||||
|
||||
@ -92,11 +92,11 @@ As previously explained, the first step is to build the TensorRT engine as descr
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
The summarization can be done using the [`../summarize.py`](../summarize.py) script as follows:
|
||||
The summarization can be done using the [`summarize.py`](../../../summarize.py) script as follows:
|
||||
|
||||
```bash
|
||||
# Run the summarization task using a TensorRT-LLM model and a single GPU.
|
||||
python3 ../summarize.py --engine_dir ./phi-engine \
|
||||
python3 ../../../summarize.py --engine_dir ./phi-engine \
|
||||
--hf_model_dir /path/to/phi-model \
|
||||
--batch_size 1 \
|
||||
--test_trt_llm \
|
||||
@ -107,7 +107,7 @@ python3 ../summarize.py --engine_dir ./phi-engine \
|
||||
|
||||
# Run the summarization task using a TensorRT-LLM model and 2-way tensor parallelism.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python3 ../summarize.py --engine_dir ./phi-engine-tp2 \
|
||||
python3 ../../../summarize.py --engine_dir ./phi-engine-tp2 \
|
||||
--hf_model_dir /path/to/phi-model \
|
||||
--batch_size 1 \
|
||||
--test_hf \
|
||||
@ -126,7 +126,7 @@ FP8 checkpoints can be built as follows:
|
||||
|
||||
```bash
|
||||
DTYPE=bfloat16
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--model_dir phi3-model \
|
||||
--output_dir ./phi3-checkpoint \
|
||||
--dtype $DTYPE \
|
||||
@ -137,7 +137,7 @@ INT8 checkpoints can be built as follows:
|
||||
|
||||
```bash
|
||||
DTYPE=bfloat16
|
||||
python3 ../quantization/quantize.py \
|
||||
python3 ../../../quantization/quantize.py \
|
||||
--model_dir phi3-model \
|
||||
--output_dir ./phi3-checkpoint \
|
||||
--dtype $DTYPE \
|
||||
@ -161,7 +161,7 @@ git-lfs clone https://huggingface.co/sikoraaxd/Phi-3-mini-4k-instruct-ru-lora
|
||||
* Quantize the Phi-3-mini model to fp8 from HF
|
||||
```bash
|
||||
BASE_PHI_3_MINI_MODEL=./Phi-3-mini-4k-instruct
|
||||
python ../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
|
||||
python ../../../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -170,7 +170,7 @@ python ../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
|
||||
```
|
||||
|
||||
* Build engine and run inference.
|
||||
```
|
||||
```bash
|
||||
trtllm-build --checkpoint_dir phi3_mini_4k_instruct/trt_ckpt/fp8/1-gpu \
|
||||
--output_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
|
||||
--gemm_plugin auto \
|
||||
@ -180,7 +180,7 @@ trtllm-build --checkpoint_dir phi3_mini_4k_instruct/trt_ckpt/fp8/1-gpu \
|
||||
--lora_plugin auto \
|
||||
--lora_dir ./Phi-3-mini-4k-instruct-ru-lora
|
||||
|
||||
python ../run.py --engine_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
|
||||
python ../../../run.py --engine_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
|
||||
--max_output_len 500 \
|
||||
--tokenizer_dir ./Phi-3-mini-4k-instruct-ru-lora \
|
||||
--input_text "<|user|>\nCan you provide ways to eat combinations of bananas and dragonfruits?<|end|>\n<|assistant|>" \
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
datasets==3.1.0
|
||||
evaluate
|
||||
@ -20,14 +20,14 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM Qwen implementation can be found in [models/qwen](../../tensorrt_llm/models/qwen/). The TensorRT-LLM Qwen example code is located in [`examples/qwen`](./). There is one main file:
|
||||
The TensorRT-LLM Qwen implementation can be found in [models/qwen](../../../../tensorrt_llm/models/qwen/). The TensorRT-LLM Qwen example code is located in [`examples/models/core/qwen`](./). There is one main file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Qwen model.
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
## Support Matrix
|
||||
| Model Name | FP16/BF16 | FP8 | WO | AWQ | GPTQ | SQ | TP | PP | Arch |
|
||||
@ -72,7 +72,7 @@ Currently Qwen1 models does not support dynamic NTK and logn attention. Therefor
|
||||
|
||||
## Usage
|
||||
|
||||
The TensorRT-LLM Qwen example code is located at [examples/qwen](./). It takes HF weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
|
||||
The TensorRT-LLM Qwen example code is located at [examples/models/core/qwen](./). It takes HF weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
|
||||
|
||||
### Download model weights
|
||||
|
||||
@ -257,7 +257,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
|
||||
|
||||
```bash
|
||||
# Quantize model into FP8 and export trtllm checkpoint
|
||||
python ../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
|
||||
python ../../../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -296,7 +296,7 @@ To run the AWQ Qwen example, the following steps are required:
|
||||
|
||||
```bash
|
||||
# Quantize Qwen-7B-Chat checkpoint into INT4 AWQ format
|
||||
python ../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
|
||||
python ../../../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
|
||||
--dtype float16 \
|
||||
--qformat int4_awq \
|
||||
--awq_block_size 128 \
|
||||
@ -325,19 +325,19 @@ To run a TensorRT-LLM Qwen model using the engines generated by `trtllm-build`
|
||||
|
||||
```bash
|
||||
# With fp16 inference
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/fp16/1-gpu/
|
||||
|
||||
# With bf16 inference
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/bf16/1-gpu
|
||||
|
||||
# With int8 weight only inference
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/int8_weight_only/1-gpu/
|
||||
@ -358,7 +358,7 @@ Output [Text 0 Beam 0]: "你好,我是来自阿里云的大规模语言模型
|
||||
|
||||
```bash
|
||||
# With int4 weight only inference
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_weight_only/1-gpu/
|
||||
@ -376,7 +376,7 @@ Output [Text 0 Beam 0]: "我叫通义千问,是由阿里云开发的预训练
|
||||
|
||||
```bash
|
||||
# With INT4 GPTQ quantization
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen-7B-Chat-Int4 \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_GPTQ/1-gpu/
|
||||
@ -394,7 +394,7 @@ Output [Text 0 Beam 0]: "你好,我是通义千问,由阿里云开发。<|im
|
||||
|
||||
```bash
|
||||
# With INT4 AWQ quantization
|
||||
python3 ../run.py --input_text "你好,请问你叫什么?" \
|
||||
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_AWQ/1-gpu/
|
||||
@ -413,7 +413,7 @@ Output [Text 0 Beam 0]: "你好,我是通义千问,由阿里云开发。<|im
|
||||
```bash
|
||||
# Run 72B model with 8-gpu
|
||||
mpirun -n 8 --allow-run-as-root \
|
||||
python ../run.py --input_text "What is your name?" \
|
||||
python ../../../run.py --input_text "What is your name?" \
|
||||
--max_output_len=50 \
|
||||
--tokenizer_dir ./tmp/Qwen/72B/ \
|
||||
--engine_dir=./tmp/Qwen/72B/trt_engines/fp16/8-gpu/
|
||||
@ -453,7 +453,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp16 \
|
||||
Run inference:
|
||||
|
||||
```bash
|
||||
python ../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
|
||||
python ../../../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
|
||||
--max_output_len 50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--input_text "안녕하세요, 혹시 이름이 뭐에요?" \
|
||||
@ -477,7 +477,7 @@ In that case, the model will not run the LoRA module and the results will be
|
||||
different.
|
||||
|
||||
```bash
|
||||
python ../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
|
||||
python ../../../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
|
||||
--max_output_len 50 \
|
||||
--tokenizer_dir ./tmp/Qwen/7B/ \
|
||||
--input_text "안녕하세요, 혹시 이름이 뭐에요?" \
|
||||
@ -498,7 +498,7 @@ Output [Text 0 Beam 0]: "안녕하세요! 저는 "QianWen"입니다.<|im_end|>
|
||||
|
||||
```bash
|
||||
# Run summarization using the Qwen 7B model in FP16.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/1-gpu/ \
|
||||
@ -506,7 +506,7 @@ python ../summarize.py --test_trt_llm \
|
||||
--output_len 2048
|
||||
|
||||
# Run summarization using the Qwen 7B model in BF16.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/bf16/1-gpu/ \
|
||||
@ -514,7 +514,7 @@ python ../summarize.py --test_trt_llm \
|
||||
--output_len 2048
|
||||
|
||||
# Run summarization using the Qwen 7B model quantized to INT8.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/int8_weight_only/1-gpu/ \
|
||||
@ -522,7 +522,7 @@ python ../summarize.py --test_trt_llm \
|
||||
--output_len 2048
|
||||
|
||||
# Run summarization using the Qwen 7B model quantized to INT4.
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/int4_weight_only/1-gpu/ \
|
||||
@ -531,7 +531,7 @@ python ../summarize.py --test_trt_llm \
|
||||
|
||||
# Run summarization using the Qwen 7B model in FP16 using two GPUs.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/2-gpu/ \
|
||||
@ -540,7 +540,7 @@ mpirun -n 2 --allow-run-as-root \
|
||||
|
||||
# Run summarization using the Qwen 14B model in FP16 using two GPUs.
|
||||
mpirun -n 2 --allow-run-as-root \
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/14B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/14B/trt_engines/fp16/2-gpu/ \
|
||||
@ -549,7 +549,7 @@ mpirun -n 2 --allow-run-as-root \
|
||||
```
|
||||
**Demo output of summarize.py:**
|
||||
```bash
|
||||
python ../summarize.py --test_trt_llm \
|
||||
python ../../../summarize.py --test_trt_llm \
|
||||
--hf_model_dir ./tmp/Qwen/7B/ \
|
||||
--data_type fp16 \
|
||||
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/1-gpu/ \
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
datasets==3.1.0
|
||||
evaluate
|
||||
@ -47,7 +47,7 @@
|
||||
The built Qwen engines are located in `${ENGINE_DIR}/llm`.
|
||||
|
||||
You can also replace `--checkpoint_dir` with an INT8 weight-only checkpoint to build an INT8 weight-only engine (see the sketch below).
|
||||
For more information about Qwen, refer to the README.md in [`examples/qwen`](../qwen).
|
||||
For more information about Qwen, refer to the README.md in [`examples/models/core/qwen`](../qwen).
|
||||
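A minimal sketch of that swap, assuming an INT8 weight-only checkpoint has already been exported; the checkpoint directory name is a placeholder, and the remaining build options stay as in the build command above:

```bash
# Sketch: same build command, pointing at a hypothetical INT8 weight-only checkpoint
# directory; the other build options are unchanged and omitted here.
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_int8_wo \
             --output_dir ${ENGINE_DIR}/llm
```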
|
||||
4. Assemble everything into the Qwen2-Audio pipeline.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.dev0
|
||||
datasets==3.1.0
|
||||
evaluate
|
||||
@ -25,7 +25,7 @@
|
||||
```
|
||||
2. Convert
|
||||
```bash
|
||||
python3 ./examples/qwen/convert_checkpoint.py --model_dir=./Qwen-VL-Chat \
|
||||
python3 ./examples/models/core/qwen/convert_checkpoint.py --model_dir=./Qwen-VL-Chat \
|
||||
--output_dir=./tllm_checkpoint_1gpu \
|
||||
--dtype float16
|
||||
```
|
||||
|
Before Width: | Height: | Size: 2.4 MiB After Width: | Height: | Size: 2.4 MiB |
|
Before Width: | Height: | Size: 485 KiB After Width: | Height: | Size: 485 KiB |
@ -1,4 +1,4 @@
|
||||
-c ../constraints.txt
|
||||
-c ../../../constraints.txt
|
||||
tensorrt_llm>=0.0.0.dev0
|
||||
datasets==3.1.0
|
||||
evaluate
|
||||
@ -4,14 +4,14 @@ This document shows how to build and run a [RecurrentGemma](https://github.com/g
|
||||
|
||||
## Overview
|
||||
|
||||
The TensorRT-LLM RecurrentGemma implementation can be found in [`tensorrt_llm/models/recurrentgemma/model.py`](../../tensorrt_llm/models/recurrentgemma/model.py). The TensorRT-LLM RecurrentGemma example code is located in [`examples/recurrentgemma`](./). There is one main file:
|
||||
The TensorRT-LLM RecurrentGemma implementation can be found in [`tensorrt_llm/models/recurrentgemma/model.py`](../../../../tensorrt_llm/models/recurrentgemma/model.py). The TensorRT-LLM RecurrentGemma example code is located in [`examples/models/core/recurrentgemma`](./). There is one main file:
|
||||
|
||||
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the JAX format to the TensorRT-LLM format.
|
||||
|
||||
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
|
||||
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
|
||||
|
||||
* [`../run.py`](../run.py) to run the inference on an input text;
|
||||
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
* [`run.py`](../../../run.py) to run the inference on an input text;
|
||||
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
|
||||
|
||||
## Support Matrix
|
||||
| Checkpoint type | FP16 | BF16 | FP8 | INT8 SQ | INT4 AWQ | TP |
|
||||
@ -63,7 +63,7 @@ python convert_checkpoint.py --model_dir ${CKPT_2B_PATH} \
|
||||
# recurrentgemma-2b-it FP8 with FP8 kv cache
|
||||
CKPT_2B_IT_PATH=./recurrentgemma_model/recurrentgemma-2b-it
|
||||
UNIFIED_CKPT_2B_IT_FP8_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/fp8/1-gpu/
|
||||
python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
|
||||
python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
@ -73,7 +73,7 @@ python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \

 # recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
 UNIFIED_CKPT_2B_IT_INT8_SQ_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/int8_sq/1-gpu/
-python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
+python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
     --dtype float16 \
     --qformat int8_sq \
     --kv_cache_dtype int8 \
@ -83,7 +83,7 @@ python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \

 # recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
 UNIFIED_CKPT_2B_IT_INT4_AWQ_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/int4_awq/1-gpu/
-python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
+python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
     --dtype float16 \
     --qformat int4_awq \
     --kv_cache_dtype int8 \
@ -182,7 +182,7 @@ Note that we need to download the dataset of MMLU first and the evaluation of MM
 ```bash
 # recurrentgemma-2b
 TOKENIZER_DIR_2B_PATH=./recurrentgemma_model/recurrentgemma-2b
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_PATH} \
@ -190,21 +190,21 @@ python3 ../run.py --max_output_len=100 \

 # recurrentgemma-2b-it FP8 with FP8 kv cache
 TOKENIZER_DIR_2B_IT_PATH=./recurrentgemma_model/recurrentgemma-2b-it
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
     --engine_dir ${ENGINE_2B_IT_FP8_PATH}

 # recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
     --engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH}

 # recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
@ -212,7 +212,7 @@ python3 ../run.py --max_output_len=100 \

 # recurrentgemma-2b-flax
 VOCAB_FILE_2B_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-flax/tokenizer.model
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --vocab_file ${VOCAB_FILE_2B_FLAX_PATH} \
@ -220,7 +220,7 @@ python3 ../run.py --max_output_len=100 \

 # recurrentgemma-2b-it-flax
 VOCAB_FILE_2B_IT_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-it-flax/tokenizer.model
-python3 ../run.py --max_output_len=100 \
+python3 ../../../run.py --max_output_len=100 \
     --use_py_session \
     --max_attention_window_size 2048 \
     --vocab_file ${VOCAB_FILE_2B_IT_FLAX_PATH} \
@ -231,7 +231,7 @@ python3 ../run.py --max_output_len=100 \

 ```bash
 # recurrentgemma-2b
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_PATH} \
     --batch_size 8 \
@ -239,7 +239,7 @@ python3 ../summarize.py --test_trt_llm \
     --tokenizer_dir ${TOKENIZER_DIR_2B_PATH}

 # recurrentgemma-2b-it FP8 with FP8 kv cache
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_IT_FP8_PATH} \
     --batch_size 8 \
@ -247,7 +247,7 @@ python3 ../summarize.py --test_trt_llm \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}

 # recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH} \
     --batch_size 8 \
@ -255,7 +255,7 @@ python3 ../summarize.py --test_trt_llm \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}

 # recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_IT_INT4_AWQ_PATH} \
     --batch_size 8 \
@ -263,7 +263,7 @@ python3 ../summarize.py --test_trt_llm \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}

 # recurrentgemma-2b-flax
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_FLAX_PATH} \
     --batch_size 8 \
@ -271,7 +271,7 @@ python3 ../summarize.py --test_trt_llm \
     --vocab_file ${VOCAB_FILE_2B_FLAX_PATH}

 # recurrentgemma-2b-it-flax
-python3 ../summarize.py --test_trt_llm \
+python3 ../../../summarize.py --test_trt_llm \
     --use_py_session \
     --engine_dir ${ENGINE_2B_IT_FLAX_PATH} \
     --batch_size 8 \
@ -294,37 +294,37 @@ Evaluate on MMLU dataset.

 ```bash
 # recurrentgemma-2b
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_PATH} \
     --engine_dir ${ENGINE_2B_PATH}

 # recurrentgemma-2b-it FP8 with FP8 kv cache
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
     --engine_dir ${ENGINE_2B_IT_FP8_PATH}

 # recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
     --engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH}

 # recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
     --engine_dir ${ENGINE_2B_IT_INT4_AWQ_PATH}

 # recurrentgemma-2b-flax
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --vocab_file ${VOCAB_FILE_2B_FLAX_PATH} \
     --engine_dir ${ENGINE_2B_FLAX_PATH}

 # recurrentgemma-2b-it-flax
-python3 ../mmlu.py --test_trt_llm \
+python3 ../../../mmlu.py --test_trt_llm \
     --max_attention_window_size 2048 \
     --vocab_file ${VOCAB_FILE_2B_IT_FLAX_PATH} \
     --engine_dir ${ENGINE_2B_IT_FLAX_PATH}
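A hedged aside on the MMLU data mentioned in the hunk above: one common way to fetch it is sketched below, using the public Hendrycks et al. tarball; the target directory layout and the handoff to `mmlu.py` are assumptions rather than something this diff specifies.

```bash
# Illustrative only: download and unpack the MMLU dataset before running mmlu.py.
mkdir -p data
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar -O data/mmlu.tar
tar -xf data/mmlu.tar -C data && mv data/data data/mmlu   # assumed layout expected by mmlu.py
```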
@ -1,4 +1,4 @@
--c ../constraints.txt
+-c ../../../constraints.txt
 tensorrt_llm>=0.0.0.dev0
 git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
 flax>=0.8.2
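One more hedged sketch, prompted by the requirements change above: with the constraints file now referenced three directories up, installing the example's dependencies from its new location would look roughly like this (the directory path is inferred from the diff, not stated in it).

```bash
# Illustrative only: install the example's requirements from the relocated directory.
cd examples/models/core/recurrentgemma   # new location per this commit
pip install -r requirements.txt          # the file itself pulls in "-c ../../../constraints.txt"
```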
Some files were not shown because too many files have changed in this diff.