move the rest of the models into the examples/models/core directory (#3555)

* move the rest of the models to the examples/models/core directory

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* update multimodal readme

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix example path

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix cpp test

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix tensorrt test

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
QI JUN, 2025-04-19 20:48:59 -07:00, committed by GitHub
parent c35d2a7532
commit d51ae53940
136 changed files with 509 additions and 597 deletions
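Every change in this commit follows the same pattern: a model example directory moves from `examples/<model>` to `examples/models/core/<model>`, and script invocations, relative links, and constraints paths are updated to match. A rough before/after sketch using the LLaMA checkpoint converter (the model and output directories below are placeholders, not paths from this diff):

```bash
# Old invocation (before this commit)
python examples/llama/convert_checkpoint.py \
    --model_dir ./llama-7b-hf \
    --output_dir ./trt_ckpt/llama/fp16/1-gpu \
    --dtype float16

# New invocation (after this commit)
python examples/models/core/llama/convert_checkpoint.py \
    --model_dir ./llama-7b-hf \
    --output_dir ./trt_ckpt/llama/fp16/1-gpu \
    --dtype float16
```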

View File

@ -31,7 +31,7 @@ def build_engine(base_model_dir: _pl.Path, eagle_model_dir: _pl.Path,
engine_dir: _pl.Path, build_base_model: bool, *args):
if build_base_model:
checkpoint_path = "examples/llama/convert_checkpoint.py"
checkpoint_path = "examples/models/core/llama/convert_checkpoint.py"
else:
checkpoint_path = "examples/eagle/convert_checkpoint.py"

View File

@ -119,7 +119,7 @@ class Convert(RunCMDMixin):
def command(self):
args = self.args
return [
f'python examples/enc_dec/convert_checkpoint.py',
f'python examples/models/core/enc_dec/convert_checkpoint.py',
f'--model_type {args.model_type}',
f'--model_dir {args.hf_models_dir}',
f'--output_dir {args.trt_models_dir}',

View File

@ -37,7 +37,7 @@ def convert_ckpt(model_dir: str,
world_size: int = 1,
dtype: str = 'float16'):
convert_cmd = [
sys.executable, "examples/gpt/convert_checkpoint.py",
sys.executable, "examples/models/core/gpt/convert_checkpoint.py",
f"--model_dir={model_dir}", f"--output_dir={output_dir}",
f"--dtype={dtype}", f"--tp_size={world_size}"
] + list(args)

View File

@ -32,11 +32,12 @@ def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, convert_extra_args,
ckpt_dir = engine_dir / 'ckpt'
convert_cmd = [_sys.executable, "examples/llama/convert_checkpoint.py"
] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
f'--output_dir={ckpt_dir}',
'--dtype=float16',
] + convert_extra_args
convert_cmd = [
_sys.executable, "examples/models/core/llama/convert_checkpoint.py"
] + ([f'--model_dir={weight_dir}'] if weight_dir else []) + [
f'--output_dir={ckpt_dir}',
'--dtype=float16',
] + convert_extra_args
run_command(convert_cmd)

View File

@ -31,12 +31,13 @@ import tensorrt_llm.bindings as _tb
def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
*args):
convert_args = [_sys.executable, "examples/mamba/convert_checkpoint.py"] + (
['--model_dir', str(weight_dir)] if weight_dir else []) + [
'--output_dir',
str(ckpt_dir),
'--dtype=float16',
]
convert_args = [
_sys.executable, "examples/models/core/mamba/convert_checkpoint.py"
] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
'--output_dir',
str(ckpt_dir),
'--dtype=float16',
]
run_command(convert_args)
build_args = ["trtllm-build"] + ['--checkpoint_dir',
str(ckpt_dir)] + [

View File

@ -32,7 +32,8 @@ import tensorrt_llm.bindings as _tb
def build_engine(weight_dir: _pl.Path, ckpt_dir: _pl.Path, engine_dir: _pl.Path,
*args):
convert_args = [
_sys.executable, "examples/recurrentgemma/convert_checkpoint.py"
_sys.executable,
"examples/models/core/recurrentgemma/convert_checkpoint.py"
] + (['--model_dir', str(weight_dir)] if weight_dir else []) + [
'--output_dir',
str(ckpt_dir),

View File

@ -11,7 +11,7 @@ class Run(RunCMDMixin):
for beam in args.beams_tuple:
ret.append((
mpi_run,
f'python3 examples/enc_dec/run.py --engine_dir {args.engines_dir}',
f'python3 examples/models/core/enc_dec/run.py --engine_dir {args.engines_dir}',
f'--engine_name {args.ckpt}',
f'--model_name "{args.hf_models_dir}"',
f'--max_new_tokens={args.max_new_tokens}',

View File

@ -1,22 +0,0 @@
{
"builder_config": {
"max_batch_size": 256,
"max_input_len": 512,
"name": "bert",
"precision": "float16",
"tensor_parallel": 1,
"use_refit": false
},
"plugin_config": {
"bert_attention_plugin": "float16",
"context_fmha_enabled": true,
"gemm_plugin": "float16",
"gpt_attention_plugin": false,
"identity_plugin": false,
"layernorm_plugin": false,
"layernorm_quantization_plugin": false,
"nccl_plugin": false,
"smooth_quant_gemm_plugin": false,
"weight_only_quant_matmul_plugin": false
}
}

View File

@ -1,22 +0,0 @@
{
"builder_config": {
"max_batch_size": 256,
"max_input_len": 512,
"name": "bert",
"precision": "float16",
"tensor_parallel": 1,
"use_refit": false
},
"plugin_config": {
"bert_attention_plugin": "float16",
"context_fmha_enabled": true,
"gemm_plugin": "float16",
"gpt_attention_plugin": false,
"identity_plugin": false,
"layernorm_plugin": false,
"layernorm_quantization_plugin": false,
"nccl_plugin": false,
"smooth_quant_gemm_plugin": false,
"weight_only_quant_matmul_plugin": false
}
}

View File

@ -1,22 +0,0 @@
{
"builder_config": {
"max_batch_size": 256,
"max_input_len": 512,
"name": "bert",
"precision": "float16",
"tensor_parallel": 1,
"use_refit": false
},
"plugin_config": {
"bert_attention_plugin": false,
"context_fmha_enabled": false,
"gemm_plugin": false,
"gpt_attention_plugin": false,
"identity_plugin": false,
"layernorm_plugin": false,
"layernorm_quantization_plugin": false,
"nccl_plugin": false,
"smooth_quant_gemm_plugin": false,
"weight_only_quant_matmul_plugin": false
}
}

View File

@ -1,22 +0,0 @@
{
"builder_config": {
"max_batch_size": 256,
"max_input_len": 512,
"name": "bert",
"precision": "float16",
"tensor_parallel": 1,
"use_refit": false
},
"plugin_config": {
"bert_attention_plugin": "float16",
"context_fmha_enabled": true,
"gemm_plugin": "float16",
"gpt_attention_plugin": false,
"identity_plugin": false,
"layernorm_plugin": false,
"layernorm_quantization_plugin": false,
"nccl_plugin": false,
"smooth_quant_gemm_plugin": false,
"weight_only_quant_matmul_plugin": false
}
}

View File

@ -25,7 +25,7 @@ We provide two styles of running DTM now: using TensorRT-LLM-BLS in Triton Infer
+ `--max_batch_size` more than 1 is acceptable in general usage, but we use 1 in this example.
```bash
cd examples/llama
cd examples/models/core/llama
export DRAFT_CKPT_PATH=/workspace/ckpt-draft
export TARGET_CKPT_PATH=/workspace/ckpt-target
export DRAFT_ENGINE_PATH=/workspace/engine-draft

View File

@ -4,7 +4,7 @@ This document explains how to build the BERT family, specifically [BERT](https:/
## Overview
The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../tensorrt_llm/models/bert/model.py).
The TensorRT-LLM BERT family implementation can be found in [`tensorrt_llm/models/bert/model.py`](../../../../tensorrt_llm/models/bert/model.py).
The TensorRT-LLM BERT family example code is located in [`examples/bert`](./). There are two main files in that folder:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the BERT model into tensorrt-llm checkpoint format.

View File

@ -18,15 +18,15 @@ This document explains how to build the [C4AI Command-R](https://huggingface.co/
## Overview
The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../tensorrt_llm/models/commandr/model.py).
The TensorRT-LLM Command-R implementation can be found in [`tensorrt_llm/models/commandr/model.py`](../../../../tensorrt_llm/models/commandr/model.py).
The TensorRT-LLM Command-R example code is located in [`examples/commandr`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
@ -122,23 +122,23 @@ If the engines are built successfully, you will see output like (Command-R as th
```bash
# Run the default engine of Command-R on single GPU.
python3 ../run.py --max_output_len 50 \
python3 ../../../run.py --max_output_len 50 \
--tokenizer_dir command_r_v01 \
--engine_dir trt_engines/command_r_v01/fp16/1-gpu
# Run the default engine of Command-R on single GPU, using streaming output.
python3 ../run.py --max_output_len 50 \
python3 ../../../run.py --max_output_len 50 \
--tokenizer_dir command_r_v01 \
--engine_dir trt_engines/command_r_v01/fp16/1-gpu \
--streaming
# Run the default engine of Aya-23-8B on single GPU.
python3 ../run.py --max_output_len 50 \
python3 ../../../run.py --max_output_len 50 \
--tokenizer_dir aya_23_8B \
--engine_dir trt_engines/aya_23_8B/fp16/1-gpu
# Run the default engine of Aya-23-35B on single GPU.
python3 ../run.py --max_output_len 50 \
python3 ../../../run.py --max_output_len 50 \
--tokenizer_dir aya_23_35B \
--engine_dir trt_engines/aya_23_35B/fp16/1-gpu
```
@ -148,7 +148,7 @@ python3 ../run.py --max_output_len 50 \
```bash
# Run the Tensor Parallel 4 engine of Command-R+ on 4 GPUs.
mpirun -n 4 \
python ../run.py --max_output_len 50 \
python ../../../run.py --max_output_len 50 \
--tokenizer_dir command_r_plus \
--engine_dir trt_engines/command_r_plus/fp16/4-gpu
```
@ -165,7 +165,7 @@ Output [Text 0 Beam 0]: " chef in Paris and worked in the kitchens of the French
```bash
# Run the summarization of Command-R task.
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--hf_model_dir command_r_v01 \
--engine_dir trt_engines/command_r_v01/fp16/1-gpu
```
@ -201,7 +201,7 @@ trtllm-build --checkpoint_dir trt_ckpt/command_r_v01/int8_wo/1-gpu \
--output_dir trt_engines/command_r_v01/int8_wo/1-gpu
# Run inference.
python3 ../run.py --max_output_len 50 \
python3 ../../../run.py --max_output_len 50 \
--tokenizer_dir command_r_v01 \
--engine_dir trt_engines/command_r_v01/int8_wo/1-gpu
```

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -27,7 +27,7 @@ This document shows how to build and run an Encoder-Decoder (Enc-Dec) model in T
## Overview
The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):
The TensorRT-LLM Enc-Dec implementation can be found in [tensorrt_llm/models/enc_dec/model.py](../../../../tensorrt_llm/models/enc_dec/model.py). The TensorRT-LLM Enc-Dec example code is located in [`examples/enc_dec`](./):
* `trtllm-build` to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Enc-Dec model,
* [`run.py`](./run.py) to run the inference on an example input text.
@ -202,7 +202,7 @@ Different types of runtime are provided for encoder-decoder models. Following an
- Python runtime w/ Static Batching
- (NEW) C++ runtime w/ Paged KV Cache and Inflight Batching
Please refer to the documentation for the details of [paged kv cache](../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../docs/source/advanced/gpt-attention.md#inflight-batching).
Please refer to the documentation for the details of [paged kv cache](../../../../docs/source/advanced/gpt-attention.md#paged-kv-cache) and [inflight batching](../../../../docs/source/advanced/gpt-attention.md#inflight-batching).
#### Run C++ runtime
**Note: to use inflight batching and paged kv cache features in C++ runtime, please make sure you have set `--paged_kv_cache enable` (which is by default enabled) in the `trtllm-build` command of the decoder. Meanwhile, if using Python runtime, it is recommended to disable this flag by `--paged_kv_cache disable` to avoid any unnecessary overhead.**
@ -213,12 +213,12 @@ For good usability, Python binding of the C++ runtime is provided. You can use t
```python
# Inferencing via python binding of C++ runtime with inflight batching (IFB)
python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
python3 ../../../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful."
```
You can specify `--kv_cache_free_gpu_memory_fraction` to control the percentage of free GPU memory to be used by KV cache (by default 0.9), and `--cross_kv_cache_fraction` to control the percentage of KV cache to be used by cross attention (by default 0.5, and rest of the KV cache will be used by self attention).
For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).
For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend).
#### Run with Triton Backend
[Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/encoder_decoder.md) contains the tutorial on how to run encoder-decoder engines with Tritonserver.

View File

@ -2,7 +2,7 @@
This document shows how to build and run a [EXAONE](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) model in TensorRT-LLM.
The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../tensorrt_llm/models/llama/model.py).
The TensorRT-LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
See the LLaMA example [`examples/llama`](../llama) for details.
- [EXAONE](#exaone)
@ -113,7 +113,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Build the EXAONE model using a single GPU and apply FP8 quantization.
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
--qformat fp8 \
@ -134,7 +134,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Build the EXAONE model using a single GPU and apply INT8 SmoothQuant.
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
--qformat int8_sq \
@ -154,7 +154,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Build the EXAONE model using a single GPU and apply INT4 AWQ.
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
--qformat int4_awq \
@ -173,7 +173,7 @@ Please make sure your system contains a Hopper GPU before trying the commands be
```bash
# Build the EXAONE model using a single GPU and apply W4A8 AWQ.
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
--qformat w4a8_awq \
@ -190,7 +190,7 @@ trtllm-build \
Test your engine with the [run.py](../run.py) script:
```bash
python3 ../run.py \
python3 ../../../run.py \
--input_text "When did the first world war end?" \
--max_output_len=100 \
--tokenizer_dir $HF_MODEL_DIR \
@ -198,13 +198,13 @@ python3 ../run.py \
# Run with 2 GPUs
mpirun -n 2 --allow-run-as-root \
python3 ../run.py \
python3 ../../../run.py \
--input_text "When did the first world war end?" \
--max_output_len=100 \
--tokenizer_dir $HF_MODEL_DIR \
--engine_dir trt_engines/exaone/fp16/2-gpu
python ../summarize.py \
python ../../../summarize.py \
--test_trt_llm \
--data_type fp16 \
--hf_model_dir $HF_MODEL_DIR \

View File

@ -91,7 +91,7 @@ Note that we need to download the dataset of MMLU first and the evaluation of MM
```bash
VOCAB_FILE_PATH=/tmp/models/gemma_nv/checkpoints/tmp_vocab.model
python3 ../run.py --engine_dir ${ENGINE_PATH} \
python3 ../../../run.py --engine_dir ${ENGINE_PATH} \
--max_output_len 30 \
--vocab_file ${VOCAB_FILE_PATH}
@ -102,7 +102,7 @@ Output [Text 0 Beam 0]: "chef in the renowned kitchens of Lyon. After honing his
* summarize.py
```bash
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
--max_ite 5 \
@ -132,7 +132,7 @@ mv data/data data/mmlu
Evaluate on MMLU dataset.
```bash
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH}
@ -156,7 +156,7 @@ UNIFIED_CKPT_PATH=/tmp/ckpt/hf/gemma/2b/1-gpu/
ENGINE_PATH=/tmp/engines/gemma/2B/bf16/1-gpu/
VOCAB_FILE_PATH=gemma-2b/
python3 ./examples/gemma/convert_checkpoint.py \
python3 ./convert_checkpoint.py \
--ckpt-type hf \
--model-dir ${CKPT_PATH} \
--dtype bfloat16 \
@ -170,7 +170,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--tokenizer_dir ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -218,7 +218,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -266,7 +266,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -311,7 +311,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -352,7 +352,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -382,7 +382,7 @@ UNIFIED_CKPT_PATH=/tmp/checkpoints/tmp_7b_it_tensorrt_llm/bf16/tp1/
ENGINE_PATH=/tmp/gemma/7B/bf16/1-gpu/
VOCAB_FILE_PATH=gemma-7b-pytorch/tokenizer.model
python3 ./examples/gemma/convert_checkpoint.py \
python3 ./convert_checkpoint.py \
--ckpt-type torch \
--model-dir ${CKPT_PATH} \
--dtype bfloat16 \
@ -396,13 +396,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
--max_ite 5
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH}
@ -439,7 +439,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -479,7 +479,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -529,7 +529,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -569,7 +569,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
@ -615,13 +615,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 8 \
--max_ite 5
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH}
@ -643,7 +643,7 @@ UNIFIED_CKPT_PATH=/tmp/checkpoints/tmp_1b_it_tensorrt_llm/bf16/tp1/
ENGINE_PATH=/tmp/gemma3/1b/bf16/1-gpu/
VOCAB_FILE_PATH=gemma-3-1b-it/tokenizer.model
python3 ./examples/gemma/convert_checkpoint.py \
python3 ./convert_checkpoint.py \
--ckpt-type hf \
--model-dir ${CKPT_PATH} \
--dtype bfloat16 \
@ -657,7 +657,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_seq_len 3100 \
--output_dir ${ENGINE_PATH}
python3 ./examples/summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_PATH} \
--engine_dir ${ENGINE_PATH} \
--batch_size 1 \
@ -685,8 +685,8 @@ Modelopt toolkit also provides quantization solutions. To enable it, have the la
#### Quantize Checkpoints
```
python ../quantization/quantize.py --model_dir ${HF_GEMMA_PATH} \
```bash
python ../../../quantization/quantize.py --model_dir ${HF_GEMMA_PATH} \
--dtype float16 \
--qformat ${QUANT_TYPE} \
--output_dir ${UNIFIED_CKPT_PATH} \
@ -697,7 +697,7 @@ HF_GEMMA_PATH can either be HF model card name or the downloaded model path. QUA
#### Build Engines
For fp8, build engines with:
```
```bash
trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--max_batch_size 8 \
--max_input_len 3000 \
@ -707,7 +707,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
For int4_awq and int8_sq, build engines with:
```
```bash
trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
--gemm_plugin auto \
--max_batch_size 8 \

View File

@ -1,5 +1,5 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-c ../constraints.txt
-c ../../../constraints.txt
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"

View File

@ -26,15 +26,15 @@ This document explains how to build the [glm-4-9b](https://huggingface.co/THUDM/
## Overview
The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../tensorrt_llm/models/chatglm/model.py).
The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../../../tensorrt_llm/models/chatglm/model.py).
The TensorRT-LLM ChatGLM example code is located in [`examples/glm-4-9b`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
@ -154,7 +154,7 @@ If the engines are run successfully, you will see output like (glm-4-9b as the e
```bash
# Run the default engine of GLM-4-9B on single GPU, other model name is available if built.
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/fp16/1-gpu
@ -165,7 +165,7 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"
```bash
# Run the Tensor Parallel 2 engine of glm_4_9b on two GPU, other model name is available if built.
mpirun -n 2 \
python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/fp16/2-gpu
@ -186,7 +186,7 @@ Output [Text 0 Beam 0]: "There is no new information provided in the official do
```bash
# Run the summarization of glm_4_9b task, other model name is available if built.
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--hf_model_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/fp16/1-gpu
```
@ -208,7 +208,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/int8_wo/1-gpu \
--output_dir trt_engines/glm_4_9b/int8_wo/1-gpu
# Run inference.
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/int8_wo/1-gpu
@ -232,7 +232,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/sq/1-gpu \
--output_dir trt_engines/glm_4_9b/sq/1-gpu
# Run inference.
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/sq/1-gpu
@ -240,11 +240,11 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"
### Activation-aware Weight Quantization (AWQ)
The [`../quantization/quantize.py`](../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
The [`quantize.py`](../../../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
```bash
# glm_4_9b: single gpu, int4 awq quantization
python ../quantization/quantize.py --model_dir glm_4_9b \
python ../../../quantization/quantize.py --model_dir glm_4_9b \
--dtype float16 \
--qformat int4_awq \
--output_dir trt_ckpt/glm_4_9b/int4_awq/1-gpu
@ -255,7 +255,7 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/int4_awq/1-gpu \
--output_dir trt_engines/glm_4_9b/int4_awq/1-gpu
# Run inference.
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/int4_awq/1-gpu
@ -263,11 +263,11 @@ python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?"
### FP8 Quantization
The [`../quantization/quantize.py`](../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
The [`quantize.py`](../../../quantization/quantize.py) script can be used to quantize the models and export TensorRT-LLM checkpoints.
```bash
# glm_4_9b: single gpu, fp8 quantization
python ../quantization/quantize.py --model_dir glm_4_9b \
python ../../../quantization/quantize.py --model_dir glm_4_9b \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -279,12 +279,8 @@ trtllm-build --checkpoint_dir trt_ckpt/glm_4_9b/fp8/1-gpu \
--output_dir trt_engines/glm_4_9b/fp8/1-gpu
# Run inference.
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
python3 ../../../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir glm_4_9b \
--engine_dir trt_engines/glm_4_9b/fp8/1-gpu
```
## Benchmark
* The TensorRT-LLM ChatGLM benchmark is located in [benchmarks/](../../benchmarks/README.md)

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -37,14 +37,14 @@ This document explains how to build the [GPT](https://huggingface.co/gpt2) model
## Overview
The TensorRT-LLM GPT implementation can be found in [`tensorrt_llm/models/gpt/model.py`](../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM GPT example code is located in [`examples/gpt`](./). There is one main file:
The TensorRT-LLM GPT implementation can be found in [`tensorrt_llm/models/gpt/model.py`](../../../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM GPT example code is located in [`examples/models/core/gpt`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
* FP16
@ -161,7 +161,7 @@ You can build engine(s) using random weights, which is useful for benchmarking.
```bash
# Generate an 8-GPU GPT-175B float16 checkpoint config file.
python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \
python3 ../../../generate_checkpoint_config.py --architecture GPTForCausalLM \
--vocab_size 51200 \
--hidden_size 12288 \
--num_hidden_layers 96 \
@ -172,7 +172,7 @@ python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \
# Generate a 16-GPU GPT-530B float16 checkpoint config file.
python3 ../generate_checkpoint_config.py --architecture GPTForCausalLM \
python3 ../../../generate_checkpoint_config.py --architecture GPTForCausalLM \
--vocab_size 51200 \
--hidden_size 20480 \
--num_hidden_layers 105 \
@ -207,10 +207,10 @@ trtllm-build --model_config gpt_530b/trt_ckpt/fp16/16-gpu/config.json \
### 5. Run inference
#### Single node, single GPU
The [`../run.py`](../run.py) script can be used to run inference with the built engine(s).
The [`run.py`](../../../run.py) script can be used to run inference with the built engine(s).
```bash
python3 ../run.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
python3 ../../../run.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
--tokenizer_dir gpt2 \
--max_output_len 8
```
@ -222,13 +222,13 @@ Input [Text 0]: "Born in north-east France, Soyer trained as a"
Output [Text 0 Beam 0]: " chef before moving to London in the early"
```
The [`../summarize.py`](../summarize.py) script can run the built engines to summarize the articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
The [`summarize.py`](../../../summarize.py) script can run the built engines to summarize the articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
For each summary, the script can compute the
[ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)) scores and use the `ROUGE-1` score to validate the implementation.
By passing `--test_trt_llm` flag, the script will evaluate TensorRT-LLM engines. You may also pass `--test_hf` flag to evaluate the HF model.
```bash
python3 ../summarize.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
python3 ../../../summarize.py --engine_dir gpt2/trt_engines/fp16/1-gpu \
--hf_model_dir gpt2 \
--test_trt_llm \
--test_hf
@ -258,13 +258,13 @@ To run engines using multiple GPUs on a single node, you can use `mpirun` as:
```bash
mpirun -np 2 \
python3 ../run.py --engine_dir gpt2/trt_engines/fp16/2-gpu \
python3 ../../../run.py --engine_dir gpt2/trt_engines/fp16/2-gpu \
--tokenizer_dir gpt2 \
--max_output_len 8
# Note that GPT-175B is built with random weights, so the output will also be random
mpirun -np 8 \
python3 ../run.py --engine_dir gpt_175b/trt_engines/fp16/8-gpu \
python3 ../../../run.py --engine_dir gpt_175b/trt_engines/fp16/8-gpu \
--max_output_len 8
```
@ -293,7 +293,7 @@ srun --mpi=pmix \
--container-workdir <path> \
--output logs/tensorrt_llm_%t.out \
--error logs/tensorrt_llm_%t.error \
python3 -u ../run.py --engine_dir <engine_dir> --max_output_len 8
python3 -u ../../../run.py --engine_dir <engine_dir> --max_output_len 8
```
Then, submit the job using:
@ -482,11 +482,11 @@ trtllm-build --checkpoint_dir gpt2/trt_ckpt/int4-wo/1-gpu \
### FP8 Quantization
[`../quantization/quantize.py`](../quantization/quantize.py) can do FP8 quantization and/or FP8 kv cache quantization, and export TensorRT-LLM checkpoint.
[`quantize.py`](../../../quantization/quantize.py) can do FP8 quantization and/or FP8 kv cache quantization, and export TensorRT-LLM checkpoint.
```bash
# FP8 quantization with FP8 kv cache
python3 ../quantization/quantize.py --model_dir gpt2 \
python3 ../../../quantization/quantize.py --model_dir gpt2 \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -558,7 +558,7 @@ trtllm-build --checkpoint_dir granite/trt_ckpt/fp16/4-gpu \
# Run inference
mpirun -np 4 \
python3 ../run.py --engine_dir granite/trt_engines/fp16/4-gpu \
python3 ../../../run.py --engine_dir granite/trt_engines/fp16/4-gpu \
--tokenizer_dir granite \
--input_text "def print_hello_world():" \
--max_output_len 20
@ -585,7 +585,7 @@ trtllm-build --checkpoint_dir santacoder/trt_ckpt/fp16/4-gpu \
# Run inference
mpirun -np 4 \
python3 ../run.py --engine_dir santacoder/trt_engines/fp16/4-gpu \
python3 ../../../run.py --engine_dir santacoder/trt_engines/fp16/4-gpu \
--tokenizer_dir santacoder \
--input_text "def print_hello_world():" \
--max_output_len 20
@ -613,7 +613,7 @@ trtllm-build --checkpoint_dir starcoder/trt_ckpt/fp16/4-gpu \
# Run inference
mpirun -np 4 \
python3 ../run.py --engine_dir starcoder/trt_engines/fp16/4-gpu \
python3 ../../../run.py --engine_dir starcoder/trt_engines/fp16/4-gpu \
--tokenizer_dir starcoder \
--input_text "def print_hello_world():" \
--max_output_len 20
@ -638,7 +638,7 @@ git-lfs clone https://huggingface.co/KaQyn/peft-lora-starcoder2-15b-unity-copilo
* Quantize the StarCoder2 model to fp8 from HF
```bash
BASE_STARCODER2_MODEL=./starcoder2-15b
python ../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
python ../../../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -647,14 +647,14 @@ python ../quantization/quantize.py --model_dir ${BASE_STARCODER2_MODEL} \
```
* Build engine and run inference.
```
```bash
trtllm-build --checkpoint_dir starcoder2-15b/trt_ckpt/fp8/1-gpu \
--output_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
--gemm_plugin auto \
--lora_plugin auto \
--lora_dir ./peft-lora-starcoder2-15b-unity-copilot
python ../run.py --engine_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
python ../../../run.py --engine_dir starcoder2-15b/trt_engines/fp8_lora/1-gpu \
--max_output_len 20 \
--tokenizer_dir ${BASE_STARCODER2_MODEL} \
--input_text "def print_hello_world():" \
@ -685,7 +685,7 @@ trtllm-build --checkpoint_dir gpt-next-2B/trt_ckpt/bf16/1-gpu \
--output_dir gpt-next-2B/trt_engines/bf16/1-gpu
# Run inference
python3 ../run.py --engine_dir gpt-next-2B/trt_engines/bf16/1-gpu \
python3 ../../../run.py --engine_dir gpt-next-2B/trt_engines/bf16/1-gpu \
--vocab_file gpt-next-2B/trt_ckpt/bf16/1-gpu/tokenizer.model \
--no_add_special_tokens \
--max_output_len 8
@ -718,7 +718,7 @@ It'll give you a summary of the different tasks in the table, that you can speci
Finally, you can run inference on pre-defined tokens:
```bash
python3 ../run.py --engine_dir gpt-next-8B/trt_engines/fp16/1-gpu \
python3 ../../../run.py --engine_dir gpt-next-8B/trt_engines/fp16/1-gpu \
--vocab_file gpt-next-8B/trt_ckpt/fp16/1-gpu/tokenizer.model \
--no_add_special_tokens \
--prompt_table_path email_composition.npy \
@ -752,7 +752,7 @@ trtllm-build --checkpoint_dir gpt-next-2B/trt_ckpt/fp16/1-gpu \
# Run inference directly from NeMo LoRA checkpoint
# --lora_task_ids correspond to the index of the models given with --lora_dir. -1 means no LoRA
python3 ../run.py --engine_dir gpt-next-2B/trt_engines/fp16/1-gpu \
python3 ../../../run.py --engine_dir gpt-next-2B/trt_engines/fp16/1-gpu \
--vocab_file gpt-next-2B/trt_ckpt/fp16/1-gpu/tokenizer.model \
--no_add_special_tokens \
--max_output_len 20 \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -2,7 +2,7 @@
This document shows how to build and run a [Granite 3.0](https://huggingface.co/collections/ibm-granite/granite-30-language-models-66fdb59bbb54785c3512114f) model in TensorRT-LLM.
The TensorRT-LLM Granite implementation is based on the LLaMA model, with Mixture of Experts (MoE) enabled. The implementation can be found in [`llama/model.py`](../../tensorrt_llm/models/llama/model.py). See the LLaMA example [`examples/llama`](../llama) for details.
The TensorRT-LLM Granite implementation is based on the LLaMA model, with Mixture of Experts (MoE) enabled. The implementation can be found in [`llama/model.py`](../../../../tensorrt_llm/models/llama/model.py). See the LLaMA example [`examples/models/core/llama`](../llama) for details.
- [Granite 3.0](#Granite)
- [Download model checkpoints](#download-model-checkpoints)
@ -46,12 +46,12 @@ python3 ../llama/convert_checkpoint.py --model_dir tmp/hf_checkpoints/${HF_MODEL
### FP8 PTQ
Notes:
- Currently quantize.py does not support Expert Parallelism (EP) mode yet. User should use `../llama/convert_checkpoint.py` and specify `--moe_ep_size 1` instead, if needed.
- TensorRT-LLM uses static quantization methods, which is expected to be faster at runtime as compared to dynamic quantization methods. This comes at a cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to `../quantization/README.md` for explanation.
- TensorRT-LLM uses static quantization methods, which is expected to be faster at runtime as compared to dynamic quantization methods. This comes at a cost of an offline calibration step during quantization. `batch_size` and `calib_size` can be adjusted to shorten the calibration time. Please refer to `../../../quantization/README.md` for explanation.
```bash
PREC_QUANT="fp8"
ENGINE="${HF_MODEL}_${PREC_QUANT}_tp${TP}"
python ../quantization/quantize.py --model_dir tmp/hf_checkpoints/${HF_MODEL} \
python ../../../quantization/quantize.py --model_dir tmp/hf_checkpoints/${HF_MODEL} \
--dtype ${PREC_RAW} \
--qformat ${PREC_QUANT} \
--kv_cache_dtype ${PREC_QUANT} \
@ -74,10 +74,10 @@ trtllm-build --checkpoint_dir ./tmp/tllm_checkpoints/${ENGINE} \
```
## Run Engine
Test your engine with the [run.py](../run.py) script:
Test your engine with the [run.py](../../../run.py) script:
```bash
mpirun -n ${TP} --allow-run-as-root python ../run.py --engine_dir ./tmp/trt_engines/${ENGINE} --tokenizer_dir tmp/hf_checkpoints/${HF_MODEL} --max_output_len 20 --input_text "The future of AI is"
mpirun -n ${TP} --allow-run-as-root python ../../../run.py --engine_dir ./tmp/trt_engines/${ENGINE} --tokenizer_dir tmp/hf_checkpoints/${HF_MODEL} --max_output_len 20 --input_text "The future of AI is"
```
For more usage examples see [`examples/llama/README.md`](../llama/README.md)
For more usage examples see [`examples/models/core/llama/README.md`](../llama/README.md)

View File

@ -5,16 +5,16 @@ This document shows how to build and run InternLM2 7B / 20B models in TensorRT-L
## Overview
The TensorRT-LLM InternLM2 implementation is based on the LLaMA model. The implementation can
be found in [model.py](../../tensorrt_llm/models/llama/model.py).
be found in [model.py](../../../../tensorrt_llm/models/llama/model.py).
The TensorRT-LLM InternLM2 example code lies in [`examples/internlm2`](./):
* [`convert_checkpoint.py`](./convert_checkpoint.py) converts the Huggingface Model of InternLM2 into TensorRT-LLM checkpoint.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
* FP16 / BF16
@ -23,7 +23,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
## Usage
The TensorRT-LLM InternLM2 example code locates at [examples/internlm](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
The TensorRT-LLM InternLM2 example code locates at [examples/models/core/internlm2](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
### Build TensorRT engine(s)
@ -47,7 +47,7 @@ Here're some examples:
# Build a single-GPU float16 engine from HF weights.
# gpt_attention_plugin is necessary in InternLM2.
# Try use_gemm_plugin to prevent accuracy issue.
cd examples/internlm2
cd examples/models/core/internlm2
# Convert the InternLM2 7B model using a single GPU and FP16.
python convert_checkpoint.py --model_dir ./internlm2-chat-7b/ \
@ -100,7 +100,7 @@ trtllm-build --checkpoint_dir ./internlm2-chat-7b/trt_engines/bf16/2-gpu/ \
Examples:
```bash
cd examples/internlm2
cd examples/models/core/internlm2
# For 7B models
python convert_checkpoint.py --model_dir ./internlm2-chat-7b \
@ -117,7 +117,7 @@ trtllm-build --checkpoint_dir ./internlm2-chat-7b/w8a16 \
```bash
cd examples/internlm2
cd examples/models/core/internlm2
# For 20B models
python convert_checkpoint.py --model_dir ./internlm2-chat-20b \
@ -138,33 +138,33 @@ To run a TensorRT-LLM InternLM2 model using the engines generated by `trtllm-bui
```bash
# InternLM2 7B with fp16
python ../run.py --max_output_len=120 \
python ../../../run.py --max_output_len=120 \
--input_text 'Tell me about yourself.' \
--tokenizer_dir ./internlm2-chat-7b/ \
--engine_dir=./internlm2-chat-7b/trt_engines/fp16/1-gpu/
# InternLM2 7B with bf16
python ../run.py --max_output_len=120 \
python ../../../run.py --max_output_len=120 \
--input_text 'Tell me about yourself.' \
--tokenizer_dir ./internlm2-chat-7b/ \
--engine_dir=./internlm2-chat-7b/trt_engines/bf16/1-gpu/
# InternLM2 7B with int8 weight only quantization
python ../run.py --max_output_len=120 \
python ../../../run.py --max_output_len=120 \
--input_text 'Tell me about yourself.' \
--tokenizer_dir ./internlm2-chat-7b/ \
--engine_dir=./internlm2-chat-7b/trt_engines/weight_only/1-gpu/
# InternLM2 7B with fp16 and tensor parallelism
mpirun -n 2 --allow-run-as-root \
python ../run.py --max_output_len=120 \
python ../../../run.py --max_output_len=120 \
--input_text 'Tell me about yourself.' \
--tokenizer_dir ./internlm2-chat-7b/ \
--engine_dir=./internlm2-chat-7b/trt_engines/fp16/2-gpu/
# InternLM2 20B with fp16 and tensor parallelism and pipeline parallelism
mpirun -n 4 --allow-run-as-root \
python ../run.py --max_output_len=120 \
python ../../../run.py --max_output_len=120 \
--input_text 'Tell me about yourself.' \
--tokenizer_dir ./internlm2-chat-7b/ \
--engine_dir=./internlm2-chat-7b/trt_engines/bf16/4-gpu/
@ -174,27 +174,27 @@ mpirun -n 4 --allow-run-as-root \
```bash
# Run summarization using the InternLM2 7B model in FP16.
python ../summarize.py --test_trt_llm --test_hf \
python ../../../summarize.py --test_trt_llm --test_hf \
--hf_model_dir ./internlm2-chat-7b/ \
--data_type fp16 \
--engine_dir ./engine_outputs
# Run summarization using the InternLM2 7B model quantized to w8a16.
python ../summarize.py --test_trt_llm --test_hf \
python ../../../summarize.py --test_trt_llm --test_hf \
--hf_model_dir ./internlm2-chat-7b/ \
--data_type fp16 \
--engine_dir ./engine_outputs
# Run summarization using the InternLM2 7B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm --test_hf \
python ../../../summarize.py --test_trt_llm --test_hf \
--hf_model_dir ./internlm2-chat-7b/ \
--data_type fp16 \
--engine_dir ./internlm2-chat-7b/trt_engines/fp16/2-gpu/
# Run summarization using the InternLM2 20B model in BF16 using 4 GPUs.
mpirun -n 4 --allow-run-as-root \
python ../summarize.py --test_trt_llm --test_hf \
python ../../../summarize.py --test_trt_llm --test_hf \
--hf_model_dir ./internlm2-chat-20b/ \
--data_type bf16 \
--engine_dir ./internlm2-chat-20b/trt_engines/bf16/4-gpu/

View File

@ -40,14 +40,14 @@ This document shows how to build and run a LLaMA model in TensorRT-LLM on both s
## Overview
The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama/model.py](../../tensorrt_llm/models/llama/model.py). The TensorRT-LLM LLaMA example code is located in [`examples/llama`](./). There is one main file:
The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama/model.py](../../../../tensorrt_llm/models/llama/model.py). The TensorRT-LLM LLaMA example code is located in [`examples/models/core/llama`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the LLaMA model into tensorrt-llm checkpoint format.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the parent folder [`examples`](../../../) for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
* BF16/FP16
@ -63,7 +63,7 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
## Usage
The TensorRT-LLM LLaMA example code locates at [examples/llama](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
The TensorRT-LLM LLaMA example code locates at [examples/models/core/llama](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
### Build TensorRT engine(s)
@ -316,7 +316,7 @@ awk '{printf "%s\\n", $0} END {printf "\\nSummarize this story:"}' pg64317.txt >
# Notice, `--max_input_length <n>` is a convenience option to limit the input length for the data.
# It should be set to the maximum context length the model supports. Here the limit is set to 32K.
mpirun -n 8 --allow-run-as-root \
python ../run.py \
python ../../../run.py \
--max_output_len 128 \
--max_input_length 32768 \
--input_file pg64317_sanitized.txt \
@ -355,7 +355,7 @@ git-lfs clone https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-104
To evaluate the PPL of very long context, we need to enable `use_paged_context_fmha` and setup `max_num_tokens` to enable the chunked context inference, reducing the activation memory requirement. Also, we need to enable `gather_context_logits` to return the logits to compute the PPL.
```bash
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
--dtype float16
@ -420,7 +420,7 @@ Prepare input data and run evaluation.
```bash
python examples/infinitebench/construct_synthetic_dataset.py --test_case build_kv_retrieval --test_level 0
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
--dtype float16 \
--tp_size 1
@ -470,7 +470,7 @@ python examples/infinitebench/construct_synthetic_dataset.py --test_case build_p
```bash
git-lfs clone https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k/
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-8B-Instruct-Gradient-1048k/ \
--output_dir /tmp/llama-3-8B-1048k/trt_ckpts \
--dtype float16 \
--tp_size 4
@ -500,7 +500,7 @@ For the 70B model, at least 8 A100 80GB GPUs are required.
```bash
git-lfs clone https://huggingface.co/gradientai/Llama-3-70B-Instruct-Gradient-1048k/
python examples/llama/convert_checkpoint.py --model_dir ./Llama-3-70B-Instruct-Gradient-1048k/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir ./Llama-3-70B-Instruct-Gradient-1048k/ \
--output_dir /tmp/llama-3-70B-1048k/trt_ckpts \
--dtype float16 \
--tp_size 8
@ -570,10 +570,10 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_int8_kv_wq \
--gemm_plugin auto
```
Test with `../summarize.py`:
Test with `summarize.py`:
```bash
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./llama-models/llama-7b-hf \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_weight_only/1-gpu \
@ -585,7 +585,7 @@ python ../summarize.py --test_trt_llm \
In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization) like the following command.
```bash
python ../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
python ../../../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
--output_dir ./tllm_checkpoint_1gpu_awq_int8_kv_cache \
--dtype float16 \
--qformat int4_awq \
@ -598,10 +598,10 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_awq_int8_kv_cache \
--gemm_plugin auto \
```
Test with `../summarize.py`:
Test with `summarize.py`:
```bash
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir /tmp/llama-7b-hf \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu \
@ -652,7 +652,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Quantize HF LLaMA 70B into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir ./tmp/llama/70B \
python ../../../quantization/quantize.py --model_dir ./tmp/llama/70B \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -676,7 +676,7 @@ Experimental: use FP8 GEMV to optimize performance in FP8 small-batch-size cases
```bash
# Quantize HF LLaMA 7B into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
python ../../../quantization/quantize.py --model_dir /tmp/llama-7b-hf \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -711,7 +711,7 @@ AWQ/GPTQ examples below involves 2 steps:
```bash
# Quantize HF LLaMA 7B checkpoint into INT4 AWQ format
python ../quantization/quantize.py --model_dir ./tmp/llama-7b-hf \
python ../../../quantization/quantize.py --model_dir ./tmp/llama-7b-hf \
--dtype float16 \
--qformat int4_awq \
--awq_block_size 128 \
@ -868,12 +868,12 @@ To run a TensorRT-LLM LLaMA model using the engines generated by `trtllm-build`
```bash
# With fp16 inference
python3 ../run.py --max_output_len=50 \
python3 ../../../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/fp16/1-gpu/
# With bf16 inference
python3 ../run.py --max_output_len=50 \
python3 ../../../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/bf16/1-gpu/
```
@ -910,7 +910,7 @@ To run the LLaMA 70B model on 2 nodes via Slurm, you need to prepare a Slurm scr
srun --container-image=<docker-image> \
--mpi=pmix \
... \ # more srun options here
python3 ../run.py --max_output_len=50 \
python3 ../../../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/70B/hf/ \
--engine_dir=./tmp/llama/70B/trt_engines/fp16/16-gpu/
```
@ -923,27 +923,27 @@ Considering the Slurm or other cluster management systems may be highly customiz
```bash
# Run summarization using the LLaMA 7B model in FP16.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/fp16/1-gpu/
# Run summarization using the LLaMA 7B model quantized to INT8.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/weight_only/1-gpu/
# Run summarization using the LLaMA 7B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/7B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/7B/trt_engines/fp16/2-gpu/
# Run summarization using the LLaMA 30B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/llama/30B/ \
--data_type fp16 \
--engine_dir ./tmp/llama/30B/trt_engines/fp16/2-gpu/
@ -965,7 +965,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_mistral \
--max_input_len 32256
# Run Mistral 7B fp16 inference with sliding window/cache size 4096
python ../run.py --max_output_len=50 \
python ../../../run.py --max_output_len=50 \
--tokenizer_dir ./mistral-7b-v0.1 \
--engine_dir=./tmp/mistral/7B/trt_engines/fp16/1-gpu/ \
--max_attention_window_size=4096
@ -994,7 +994,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_mistral_nemo \
--max_input_len 10240
# Run summarization using the Mistral Nemo model quantized to INT8.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./Mistral-Nemo-Instruct-2407 \
--data_type bf16 \
--engine_dir ./tmp/mistral_nemo/trt_engines/bf16/1-gpu//
@ -1024,7 +1024,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Quantize HF CodeLlama 7B into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir /tmp/CodeLlama-7b-Instruct-hf \
python ../../../quantization/quantize.py --model_dir /tmp/CodeLlama-7b-Instruct-hf \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -1070,13 +1070,13 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_4gpu_codellama \
### Run
Use the following command to run the 7b engine from above:
```
```bash
python ../../../run.py --max_output_len=40 --tokenizer_dir . --engine_dir codellama_7b --input_text "In Bash, how do I list all text files?"
```
Use the following command to run the 34b engine with long input/output from above:
```
```bash
mpirun -n 8 --allow-run-as-root \
python ../run.py --max_output_len=160 --tokenizer_dir ./CodeLlama-34b-Instruct \
python ../../../run.py --max_output_len=160 --tokenizer_dir ./CodeLlama-34b-Instruct \
--engine_dir codellama_34b --input_text "In python, write a function for binary searching an element in an integer array."
```
@ -1110,7 +1110,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_2gpu \
Run inference. Remember to use the LoRA tokenizer, because the LoRA model has a larger vocab size.
```bash
mpirun -n 2 python ../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
mpirun -n 2 python ../../../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
--max_output_len 50 \
--tokenizer_dir "chinese-llama-2-lora-13b/" \
--input_text "今天天气很好,我到公园的时候," \
@ -1128,7 +1128,7 @@ different. Since the LoRA tokenizer, embedding and LM head are still used,
the results will also differ from vanilla LLaMA and degrade significantly compared with `--lora_task_uids 0`.
```bash
mpirun -n 2 python ../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
mpirun -n 2 python ../../../run.py --engine_dir "/tmp/new_lora_13b/trt_engines/fp16/2-gpu/" \
--max_output_len 50 \
--tokenizer_dir "chinese-llama-2-lora-13b/" \
--input_text "今天天气很好,我到公园的时候," \
@ -1178,7 +1178,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu \
--max_lora_rank 8 \
--lora_target_modules attn_q attn_k attn_v
python ../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
python ../../../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/" \
--max_output_len 10 \
--tokenizer_dir ${BASE_LLAMA_MODEL} \
--input_text "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" \
@ -1228,7 +1228,7 @@ git-lfs clone https://huggingface.co/davidkim205/komt-mistral-7b-v1-lora
* Quantize the Mistral v0.1 model to fp8 from HF
```bash
BASE_MISTRAL_MODEL=komt-mistral-7b-v1/
python ../quantization/quantize.py --model_dir ${BASE_MISTRAL_MODEL} \
python ../../../quantization/quantize.py --model_dir ${BASE_MISTRAL_MODEL} \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -1247,7 +1247,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8 \
--max_seq_len 33280 \
--lora_dir ./komt-mistral-7b-v1-lora
python ../run.py --max_output_len=1024 \
python ../../../run.py --max_output_len=1024 \
--tokenizer_dir ./komt-mistral-7b-v1 \
--engine_dir=/tmp/mistral_komt_lora/7B/trt_engines/fp8/1-gpu/ \
--input_text "[INST]오늘은 날씨가 아주 좋다 내가 공원에 갔을 때 [/INST]" \
@ -1274,7 +1274,7 @@ TensorRT-LLM can also support Quantized base model + FP16/BF16 LoRA. We can firs
* Quantize the llama model to INT4-AWQ from HF
```bash
BASE_LLAMA_MODEL=llama-7b-hf/
python ../quantization/quantize.py --model_dir ${BASE_LLAMA_MODEL} \
python ../../../quantization/quantize.py --model_dir ${BASE_LLAMA_MODEL} \
--output_dir ./tllm_checkpoint_1gpu_awq \
--dtype float16 \
--qformat int4_awq \
@ -1298,7 +1298,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_awq \
--max_lora_rank 8 \
--lora_target_modules attn_q attn_k attn_v
python ../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/int4_AWQ/1-gpu/" \
python ../../../run.py --engine_dir "/tmp/llama_7b_with_lora_qkv/trt_engines/int4_AWQ/1-gpu/" \
--max_output_len 10 \
--tokenizer_dir ${BASE_LLAMA_MODEL} \
--input_text "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" \
@ -1353,7 +1353,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_streamlingllm \
```bash
# Run LLaMA 7B fp16 inference with sliding window/cache size 2048 and sink token length 4.
python3 ../run.py --max_output_len=50 \
python3 ../../../run.py --max_output_len=50 \
--tokenizer_dir ./tmp/llama/7B/ \
--engine_dir=./tmp/llama/7B/trt_engines/fp16_StreamingLLM/1-gpu/ \
--max_attention_window_size=2048 \
@ -1377,7 +1377,7 @@ Note: For 405B HF model cloned before 09 Aug 2024, there are duplicated kv head
```bash
# Run BF16 model by BF16
python examples/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_model/ \
python examples/models/core/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_model/ \
--output_dir llama_3.1_405B_meta_model/trt_ckpts/tp8-pp2/ \
--dtype bfloat16 \
--tp_size 8 \
@ -1386,7 +1386,7 @@ python examples/llama/convert_checkpoint.py --meta_ckpt_dir llama_3.1_405B_meta_
--workers 2
# Run BF16 model by FP8
python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/ \
--output_dir llama_3.1_405B_HF_model/trt_ckpts/tp8-pp1/ \
--dtype bfloat16 \
--use_fp8_rowwise \
@ -1400,7 +1400,7 @@ python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_model/
# Optionally enable --use_meta_fp8_rowwise_recipe to strictly follow Meta's original LLaMA 3.1 recipe:
# (1) Skip quantization for the first and last Transformer layers
# (2) Skip quantization for the Attention layers
python examples/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_FP8_model/ \
python examples/models/core/llama/convert_checkpoint.py --model_dir llama_3.1_405B_HF_FP8_model/ \
--output_dir llama_3.1_405B_HF_FP8_model/trt_ckpts/tp8-pp1/ \
--dtype bfloat16 \
--tp_size 8 \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers>=4.43.0
datasets==3.1.0

View File

@ -13,14 +13,14 @@ This document shows how to build and run a [Mamba](https://github.com/state-spac
## Overview
The TensorRT-LLM Mamba implementation can be found in [`tensorrt_llm/models/mamba/model.py`](../../tensorrt_llm/models/mamba/model.py). The TensorRT-LLM Mamba example code is located in [`examples/mamba`](./). There is one main file:
The TensorRT-LLM Mamba implementation can be found in [`tensorrt_llm/models/mamba/model.py`](../../../../tensorrt_llm/models/mamba/model.py). The TensorRT-LLM Mamba example code is located in [`examples/models/core/mamba`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format.
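For instance, a conversion sketch for mamba-2.8b (a sketch only: the output path here is hypothetical, and the flags assume the usual `--model_dir`/`--output_dir`/`--dtype` convention of the other `convert_checkpoint.py` scripts; check `--help` for the exact options):

```bash
# Sketch: convert the HF mamba-2.8b checkpoint into a TensorRT-LLM checkpoint
python convert_checkpoint.py --model_dir ./mamba_model/mamba-2.8b/ \
                             --output_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
                             --dtype bfloat16
```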
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the top-level [`examples`](../../../) folder for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
@ -182,35 +182,35 @@ The following section describes how to run a TensorRT-LLM Mamba model to summari
```bash
# mamba-2.8b
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba-2.8b/ \
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
--data_type bf16 \
--engine_dir ./mamba_model/mamba-2.8b/trt_engines/bf16/1-gpu/
# mamba-130m
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba-130m/ \
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
--data_type fp16 \
--engine_dir ./mamba_model/mamba-130m/trt_engines/fp16/1-gpu/
# mamba2-2.7b
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba2-2.7b/ \
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
--data_type fp16 \
--engine_dir ./mamba_model/mamba2-2.7b/trt_engines/fp16/1-gpu/
# mamba2-130m
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba2-130m/ \
--tokenizer_dir ./mamba_model/gpt-neox-20b/ \
--data_type fp16 \
--engine_dir ./mamba_model/mamba2-130m/trt_engines/fp16/1-gpu/
# mamba-codestral-7B-v0.1
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
--tokenizer_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
--data_type fp16 \
@ -218,7 +218,7 @@ python ../summarize.py --test_trt_llm \
# mamba-codestral-7B-v0.1 with 2-way tensor parallelism.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
--tokenizer_dir ./mamba_model/mamba-codestral-7B-v0.1/ \
--data_type fp16 \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers>=4.39.0
datasets==3.1.0

View File

@ -7,8 +7,8 @@ sufficient.
## Overview
The TensorRT-LLM Mixtral implementation is based on the LLaMA model, with Mixture of Experts enabled. The implementation can
be found in [tensorrt_llm/models/llama/model.py](../../tensorrt_llm/models/llama/model.py).
See the LLaMA example [`examples/llama`](../llama) for details.
be found in [tensorrt_llm/models/llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
See the LLaMA example [`examples/models/core/llama`](../llama) for details.
### Build TensorRT engine(s)
@ -74,13 +74,13 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_8gpu \
--gemm_plugin float16
```
Then, you can test your engine with the [run.py](../run.py) script:
Then, you can test your engine with the [run.py](../../../run.py) script:
```bash
mpirun -n 2 python3 ../run.py --engine_dir ./trt_engines/mixtral/tp2 --tokenizer_dir ./Mixtral-8x7B-v0.1 --max_output_len 8 --input_text "I love french quiche"
mpirun -n 2 python3 ../../../run.py --engine_dir ./trt_engines/mixtral/tp2 --tokenizer_dir ./Mixtral-8x7B-v0.1 --max_output_len 8 --input_text "I love french quiche"
```
For more examples see [`examples/llama/README.md`](../llama/README.md)
For more examples see [`examples/models/core/llama/README.md`](../llama/README.md)
### Parallelism Modes
@ -129,7 +129,7 @@ of the different top-k values.
- 2 (SPARSE_MIXER) corresponds to: `scales = sparsemixer(routing values)`
Mixtral uses `RENORM` mode, which is the default. To use a different mode, pass the `--moe_normalization_mode` flag (a sketch follows below).
See [tensorrt_llm/layers/moe.py](../../tensorrt_llm/layers/moe.py#L56) for available values
See [tensorrt_llm/layers/moe.py](../../../../tensorrt_llm/layers/moe.py#L56) for available values
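A rough sketch of that switch, assuming the flag is passed to `convert_checkpoint.py` together with the other conversion options and accepts the numeric mode listed above (`2` = `SPARSE_MIXER`); the paths and the `--tp_size` value simply mirror the conversion examples elsewhere in this README:

```bash
# Sketch only: convert Mixtral with SPARSE_MIXER normalization instead of the default RENORM
python3 convert_checkpoint.py --model_dir ./Mixtral-8x7B-v0.1 \
                              --output_dir ./tllm_checkpoint_mixtral_2gpu \
                              --dtype float16 \
                              --tp_size 2 \
                              --moe_normalization_mode 2
```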
## Quantization
@ -153,11 +153,11 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_2gpu \
### FP8 Post-Training Quantization
Mixtral supports FP8 quantization, using Modelopt. See [`examples/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing Modelopt
Mixtral supports FP8 quantization, using Modelopt. See [`examples/models/core/llama/README.md`](../llama/README.md#fp8-post-training-quantization) for full details on installing Modelopt
```bash
# Quantize HF Mixtral into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
python ../../../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -194,7 +194,7 @@ Mixtral supports NVFP4 quantization.
```bash
# Quantize HF Mixtral into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
python ../../../quantization/quantize.py --model_dir ./Mixtral-8x7B-v0.1 \
--dtype float16 \
--qformat nvfp4 \
--kv_cache_dtype fp8 \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
transformers==4.38.2
accelerate==0.25.0

View File

@ -62,7 +62,7 @@ Not all models supports end-to-end `cpp` mode, the checked ones below are suppor
This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed when switching the LLM backbone.
1. Download Huggingface weights and convert original checkpoint to TRT-LLM checkpoint format
following example in `examples/opt/README.md` and `examples/enc_dec/README.md`.
following example in `examples/models/contrib/opt/README.md` and `examples/models/core/enc_dec/README.md`.
```bash
export MODEL_NAME="blip2-opt-2.7b" # options: blip2-opt-6.7b, blip2-flan-t5-xl, blip2-flan-t5-xxl
@ -71,7 +71,7 @@ This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed
For BLIP2-OPT family,
```bash
python ../opt/convert_checkpoint.py --model_type blip2 \
python ../../contrib/opt/convert_checkpoint.py --model_type blip2 \
--model_dir tmp/hf_models/${MODEL_NAME} \
--output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
--dtype float16
@ -168,7 +168,7 @@ This BLIP section covers both BLIP2-OPT and BLIP2-T5, with minor changes needed
5. (Optional) INT8/INT4 weight-only quantization for OPT can be enabled using commands as follows (take `INT4` as an example, while `INT8` is the default precision for weight-only quantization):
```bash
python ../opt/convert_checkpoint.py \
python ../../contrib/opt/convert_checkpoint.py \
--model_dir tmp/hf_models/${MODEL_NAME} \
--dtype float16 \
--output_dir tmp/trt_models/${MODEL_NAME}/int4_weightonly/1-gpu \
@ -216,7 +216,7 @@ Currently, CogVLM only support bfloat16 precision.
CogVLM uses a ViT encoder as its vision encoder and a modified Llama as its decoder.
```bash
python ../cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir tmp/trt_models/${MODEL_NAME} --dtype bfloat16 --use_prompt_tuning
python ../../contrib/cogvlm/convert_checkpoint.py --model_dir tmp/hf_models/${MODEL_NAME} --output_dir tmp/trt_models/${MODEL_NAME} --dtype bfloat16 --use_prompt_tuning
trtllm-build --checkpoint_dir tmp/trt_models/${MODEL_NAME} \
--output_dir tmp/trt_engines/${MODEL_NAME}/bf16/1-gpu/llm \
@ -461,7 +461,7 @@ Firstly, please install transformers with 4.37.2
```bash
# FP8 quantization
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir tmp/hf_models/${MODEL_NAME} \
--output_dir tmp/trt_models/${MODEL_NAME}/fp8/1-gpu \
--dtype bfloat16 \
@ -469,7 +469,7 @@ Firstly, please install transformers with 4.37.2
--kv_cache_dtype fp8
# INT8 SmoothQuant quantization
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir tmp/hf_models/${MODEL_NAME} \
--output_dir tmp/trt_models/${MODEL_NAME}/int8/1-gpu \
--dtype bfloat16 \
@ -710,7 +710,7 @@ Firstly, please install transformers with 4.37.2
--weight_only_precision int4
# INT4 AWQ
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir tmp/hf_models/${MODEL_NAME} \
--output_dir tmp/trt_models/${MODEL_NAME}/int4_awq/1-gpu \
--dtype float16 \
@ -737,7 +737,7 @@ For LLaMA-3.2 text model, please refer to the [examples/llama/README.md](../llam
* build engine of vision encoder model
```bash
python examples/multimodal/build_multimodal_engine.py --model_type mllama \
python examples/models/core/multimodal/build_multimodal_engine.py --model_type mllama \
--model_path Llama-3.2-11B-Vision/ \
--output_dir /tmp/mllama/trt_engines/vision/
```
@ -745,7 +745,7 @@ python examples/multimodal/build_multimodal_engine.py --model_type mllama \
* build engine of decoder model
```bash
python examples/mllama/convert_checkpoint.py --model_dir Llama-3.2-11B-Vision/ \
python examples/models/core/mllama/convert_checkpoint.py --model_dir Llama-3.2-11B-Vision/ \
--output_dir /tmp/mllama/trt_ckpts \
--dtype bfloat16
@ -765,7 +765,7 @@ Note that for instruct Vision model, please set the `max_encoder_input_len` as `
* Run test on multimodal/run.py with C++ runtime (LLM part only)
```bash
python3 examples/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
--hf_model_dir Llama-3.2-11B-Vision/ \
--image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
--input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one" \
@ -774,7 +774,7 @@ python3 examples/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
`model_runner_cpp` is used by default. To switch to the Python `model_runner`, add `--session python` to the command above (see the sketch right below).
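# Hedged sketch (not part of the original README): the same run.py command switched to the
# Python model_runner session; any flags omitted here stay as in the command above.
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/mllama/trt_engines/ \
        --hf_model_dir Llama-3.2-11B-Vision/ \
        --session python \
        --image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
        --input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one"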
python3 examples/multimodal/eval.py \
python3 examples/models/core/multimodal/eval.py \
--engine_dir /tmp/mllama/trt_engines/ \
--hf_model_dir Llama-3.2-11B-Vision/ \
--test_trtllm \
@ -810,14 +810,14 @@ trtllm-build --checkpoint_dir /tmp/llama-3.2-11B-Vision/fp8/ \
# Copy the visual engine directory `/tmp/mllama/trt_engines/vision/` to the FP8 engine directory `/tmp/trt_engines/llama-3.2-11B-Vision/fp8/vision`, for example:
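# (a sketch; adjust the paths to your own layout)
cp -r /tmp/mllama/trt_engines/vision/ /tmp/trt_engines/llama-3.2-11B-Vision/fp8/vision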
python3 examples/multimodal/run.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
python3 examples/models/core/multimodal/run.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
--hf_model_dir Llama-3.2-11B-Vision/ \
--image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg \
--input_text "<|image|><|begin_of_text|>If I had to write a haiku for this one" \
--max_new_tokens 50 \
--batch_size 2
python3 examples/multimodal/eval.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
python3 examples/models/core/multimodal/eval.py --engine_dir /tmp/trt_engines/llama-3.2-11B-Vision/fp8/ \
--hf_model_dir Llama-3.2-11B-Vision/ \
--test_trtllm \
--accuracy_threshold 65 \
@ -1049,7 +1049,7 @@ pip install -r requirements-qwen2vl.txt
```bash
pip install decord # used for loading video
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path /path/to/nemotron/model.nemo \
--dtype bfloat16 \
--batch_size 64 \

View File

@ -14,12 +14,12 @@ This document demonstrates how to build the Nemotron models using TensorRT-LLM a
## Overview
The TensorRT-LLM Nemotron implementation is based on the GPT model, which can be found in [`tensorrt_llm/models/gpt/model.py`](../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM Nemotron example is located in [`examples/nemotron`](./).
The TensorRT-LLM Nemotron implementation is based on the GPT model, which can be found in [`tensorrt_llm/models/gpt/model.py`](../../../../tensorrt_llm/models/gpt/model.py). The TensorRT-LLM Nemotron example is located in [`examples/models/core/nemotron`](./).
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the top-level [`examples`](../../../) folder for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
* FP16/BF16
@ -61,7 +61,7 @@ git clone https://huggingface.co/nvidia/nemotron-3-8b-chat-4k-rlhf
```
### Build TensorRT engine(s)
The [`examples/quantization/quantize.py`](../quantization/quantize.py) script can quantize the Nemotron models and export to TensorRT-LLM checkpoints. You may optionally skip the quantization step by specifying `--qformat full_prec` and thus export float16 or bfloat16 TensorRT-LLM checkpoints.
The [`examples/quantization/quantize.py`](../../../quantization/quantize.py) script can quantize the Nemotron models and export to TensorRT-LLM checkpoints. You may optionally skip the quantization step by specifying `--qformat full_prec` and thus export float16 or bfloat16 TensorRT-LLM checkpoints.
The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpoints. The number of engine files is the same as the number of GPUs used to run inference. By default, `trtllm-build` uses a single GPU, but if more GPUs are already available at build time, you can speed up engine building by enabling parallel builds with the `--workers` argument (see the sketch below).
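For instance, a minimal sketch of a parallel build (the checkpoint and engine paths reuse the 2-way tensor-parallel example later in this section; all other build options are omitted):

```bash
# Sketch only: build the two engine ranks with two parallel build workers
trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/tp2 \
             --output_dir nemotron-3-8b/trt_engines/bf16/tp2 \
             --workers 2
```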
@ -69,7 +69,7 @@ Here are some examples:
```bash
# single gpu, dtype bfloat16
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
--dtype bfloat16 \
--batch_size 64 \
@ -84,7 +84,7 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/1-gpu \
```bash
# 2-way tensor parallelism
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
--dtype bfloat16 \
--batch_size 64 \
@ -102,7 +102,7 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/bf16/tp2 \
```bash
# 2-way tensor parallelism for both calibration and inference
mpirun -np 2 \
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
--dtype bfloat16 \
--batch_size 64 \
@ -124,7 +124,7 @@ Quantize the Nemotron models to FP8 by specifying `--qformat fp8` to `quantize.p
```bash
# single gpu, fp8 quantization
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
--dtype bfloat16 \
--batch_size 64 \
@ -142,7 +142,7 @@ Quantize the Nemotron models using INT4 AWQ by specifying `--qformat int4_awq` t
```bash
# single gpu, int4 awq quantization
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--nemo_ckpt_path nemotron-3-8b-base-4k/Nemotron-3-8B-Base-4k.nemo \
--dtype bfloat16 \
--batch_size 64 \
@ -156,19 +156,19 @@ trtllm-build --checkpoint_dir nemotron-3-8b/trt_ckpt/int4_awq/1-gpu \
### Run Inference
The `../summarize.py` script can run the built engines to summarize the articles from the
The `summarize.py` script can run the built engines to summarize the articles from the
[cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
```bash
# single gpu
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--no_add_special_tokens \
--engine_dir nemotron-3-8b/trt_engines/bf16/1-gpu \
--vocab_file nemotron-3-8b/trt_ckpt/bf16/1-gpu/tokenizer.model
# multiple gpus
mpirun -np 2 \
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--no_add_special_tokens \
--engine_dir nemotron-3-8b/trt_engines/bf16/tp2 \
--vocab_file nemotron-3-8b/trt_ckpt/bf16/tp2/tokenizer.model
@ -207,7 +207,7 @@ trtllm-build --checkpoint_dir minitron/trt_ckpt/bf16/1-gpu \
--output_dir minitron/trt_engines/bf16/1-gpu
# Run inference
python3 ../run.py --engine_dir minitron/trt_engines/bf16/1-gpu \
python3 ../../../run.py --engine_dir minitron/trt_engines/bf16/1-gpu \
--tokenizer_dir Minitron-4B-Base \
--input_text "def print_hello_world():" \
--max_output_len 20

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
nemo-toolkit[all]==2.0.0rc1
megatron-core @ git+https://github.com/NVIDIA/Megatron-LM@core_r0.8.0

View File

@ -12,7 +12,7 @@ This document shows how to convert and build a model generated by Nemotron-NAS,
## Overview
The TensorRT-LLM Nemotron-NAS implementation can be found in [tensorrt_llm/models/nemotron_nas/model.py](../../tensorrt_llm/models/nemotron_nas/model.py). The TensorRT-LLM Nemotron-NAS example code is located in [`examples/nemotron_nas`](./). There is one main file:
The TensorRT-LLM Nemotron-NAS implementation can be found in [tensorrt_llm/models/nemotron_nas/model.py](../../../../tensorrt_llm/models/nemotron_nas/model.py). The TensorRT-LLM Nemotron-NAS example code is located in [`examples/models/core/nemotron_nas`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the model into the TensorRT-LLM checkpoint format.
@ -93,7 +93,7 @@ In particular, the plugin-related options have two categories:
export DATASET_DIR="~/datasets/nemotron-nas"
python ./calibration_utils.py $DATASET_DIR # download and transform the recommended dataset.
python ../quantization/quantize.py \
python ../../../quantization/quantize.py \
--model_dir $MODEL_DIR \
--output_dir $TRT_CHECKPOINT_DIR \
--dtype bfloat16 \
@ -116,7 +116,7 @@ The conversion script supports additional models with variable GQA, such as [Dec
## Runtime
After you build the engine, you can use it with any TensorRT-LLM entrypoint or API.
For example, you can run inference with [examples/run.py](../run.py):
For example, you can run inference with [examples/run.py](../../../run.py):
```bash
export MODEL_DIR="~/models/huggingface/nemotron-nas"

View File

@ -14,14 +14,14 @@ For multimodal models (Phi-3-vision-128k-instruct and Phi-3.5-vision-instruct),
## Overview
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file:
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/models/core/phi`](./) with a single file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the top-level [`examples`](../../../) folder for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
@ -92,11 +92,11 @@ As previously explained, the first step is to build the TensorRT engine as descr
pip install -r requirements.txt
```
The summarization can be done using the [`../summarize.py`](../summarize.py) script as follows:
The summarization can be done using the [`summarize.py`](../../../summarize.py) script as follows:
```bash
# Run the summarization task using a TensorRT-LLM model and a single GPU.
python3 ../summarize.py --engine_dir ./phi-engine \
python3 ../../../summarize.py --engine_dir ./phi-engine \
--hf_model_dir /path/to/phi-model \
--batch_size 1 \
--test_trt_llm \
@ -107,7 +107,7 @@ python3 ../summarize.py --engine_dir ./phi-engine \
# Run the summarization task using a TensorRT-LLM model and 2-way tensor parallelism.
mpirun -n 2 --allow-run-as-root \
python3 ../summarize.py --engine_dir ./phi-engine-tp2 \
python3 ../../../summarize.py --engine_dir ./phi-engine-tp2 \
--hf_model_dir /path/to/phi-model \
--batch_size 1 \
--test_hf \
@ -126,7 +126,7 @@ FP8 checkpoints can be built as follows:
```bash
DTYPE=bfloat16
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--model_dir phi3-model \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
@ -137,7 +137,7 @@ INT8 checkpoints can be built as follows:
```bash
DTYPE=bfloat16
python3 ../quantization/quantize.py \
python3 ../../../quantization/quantize.py \
--model_dir phi3-model \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
@ -161,7 +161,7 @@ git-lfs clone https://huggingface.co/sikoraaxd/Phi-3-mini-4k-instruct-ru-lora
* Quantize the Phi-3-mini model to fp8 from HF
```bash
BASE_PHI_3_MINI_MODEL=./Phi-3-mini-4k-instruct
python ../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
python ../../../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -170,7 +170,7 @@ python ../quantization/quantize.py --model_dir ${BASE_PHI_3_MINI_MODEL} \
```
* Build engine and run inference.
```
```bash
trtllm-build --checkpoint_dir phi3_mini_4k_instruct/trt_ckpt/fp8/1-gpu \
--output_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
--gemm_plugin auto \
@ -180,7 +180,7 @@ trtllm-build --checkpoint_dir phi3_mini_4k_instruct/trt_ckpt/fp8/1-gpu \
--lora_plugin auto \
--lora_dir ./Phi-3-mini-4k-instruct-ru-lora
python ../run.py --engine_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
python ../../../run.py --engine_dir phi3_mini_4k_instruct/trt_engines/fp8_lora/1-gpu \
--max_output_len 500 \
--tokenizer_dir ./Phi-3-mini-4k-instruct-ru-lora \
--input_text "<|user|>\nCan you provide ways to eat combinations of bananas and dragonfruits?<|end|>\n<|assistant|>" \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -20,14 +20,14 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
## Overview
The TensorRT-LLM Qwen implementation can be found in [models/qwen](../../tensorrt_llm/models/qwen/). The TensorRT-LLM Qwen example code is located in [`examples/qwen`](./). There is one main file:
The TensorRT-LLM Qwen implementation can be found in [models/qwen](../../../../tensorrt_llm/models/qwen/). The TensorRT-LLM Qwen example code is located in [`examples/models/core/qwen`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Qwen model.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the top-level [`examples`](../../../) folder for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
| Model Name | FP16/BF16 | FP8 | WO | AWQ | GPTQ | SQ | TP | PP | Arch |
@ -72,7 +72,7 @@ Currently Qwen1 models does not support dynamic NTK and logn attention. Therefor
## Usage
The TensorRT-LLM Qwen example code locates at [examples/qwen](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
The TensorRT-LLM Qwen example code is located at [examples/models/core/qwen](./). It takes HF weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
### Download model weights
@ -257,7 +257,7 @@ First make sure Modelopt toolkit is installed (see [examples/quantization/README
```bash
# Quantize model into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
python ../../../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -296,7 +296,7 @@ To run the AWQ Qwen example, the following steps are required:
```bash
# Quantize Qwen-7B-Chat checkpoint into INT4 AWQ format
python ../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
python ../../../quantization/quantize.py --model_dir ./tmp/Qwen/7B/ \
--dtype float16 \
--qformat int4_awq \
--awq_block_size 128 \
@ -325,19 +325,19 @@ To run a TensorRT-LLM Qwen model using the engines generated by `trtllm-build`
```bash
# With fp16 inference
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--engine_dir=./tmp/Qwen/7B/trt_engines/fp16/1-gpu/
# With bf16 inference
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--engine_dir=./tmp/Qwen/7B/trt_engines/bf16/1-gpu
# With int8 weight only inference
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--engine_dir=./tmp/Qwen/7B/trt_engines/int8_weight_only/1-gpu/
@ -358,7 +358,7 @@ Output [Text 0 Beam 0]: "你好,我是来自阿里云的大规模语言模型
```bash
# With int4 weight only inference
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_weight_only/1-gpu/
@ -376,7 +376,7 @@ Output [Text 0 Beam 0]: "我叫通义千问,是由阿里云开发的预训练
```bash
# With INT4 GPTQ quantization
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen-7B-Chat-Int4 \
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_GPTQ/1-gpu/
@ -394,7 +394,7 @@ Output [Text 0 Beam 0]: "你好,我是通义千问,由阿里云开发。<|im
```bash
# With INT4 AWQ quantization
python3 ../run.py --input_text "你好,请问你叫什么?" \
python3 ../../../run.py --input_text "你好,请问你叫什么?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--engine_dir=./tmp/Qwen/7B/trt_engines/int4_AWQ/1-gpu/
@ -413,7 +413,7 @@ Output [Text 0 Beam 0]: "你好,我是通义千问,由阿里云开发。<|im
```bash
# Run 72B model with 8-gpu
mpirun -n 8 --allow-run-as-root \
python ../run.py --input_text "What is your name?" \
python ../../../run.py --input_text "What is your name?" \
--max_output_len=50 \
--tokenizer_dir ./tmp/Qwen/72B/ \
--engine_dir=./tmp/Qwen/72B/trt_engines/fp16/8-gpu/
@ -453,7 +453,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp16 \
Run inference:
```bash
python ../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
python ../../../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
--max_output_len 50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--input_text "안녕하세요, 혹시 이름이 뭐에요?" \
@ -477,7 +477,7 @@ In that case, the model will not run the LoRA module and the results will be
different.
```bash
python ../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
python ../../../run.py --engine_dir ./tmp/qwen/7B_lora/trt_engines/fp16/1-gpu \
--max_output_len 50 \
--tokenizer_dir ./tmp/Qwen/7B/ \
--input_text "안녕하세요, 혹시 이름이 뭐에요?" \
@ -498,7 +498,7 @@ Output [Text 0 Beam 0]: "안녕하세요! 저는 "QianWen"입니다.<|im_end|>
```bash
# Run summarization using the Qwen 7B model in FP16.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/1-gpu/ \
@ -506,7 +506,7 @@ python ../summarize.py --test_trt_llm \
--output_len 2048
# Run summarization using the Qwen 7B model in BF16.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/bf16/1-gpu/ \
@ -514,7 +514,7 @@ python ../summarize.py --test_trt_llm \
--output_len 2048
# Run summarization using the Qwen 7B model quantized to INT8.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/int8_weight_only/1-gpu/ \
@ -522,7 +522,7 @@ python ../summarize.py --test_trt_llm \
--output_len 2048
# Run summarization using the Qwen 7B model quantized to INT4.
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/int4_weight_only/1-gpu/ \
@ -531,7 +531,7 @@ python ../summarize.py --test_trt_llm \
# Run summarization using the Qwen 7B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/2-gpu/ \
@ -540,7 +540,7 @@ mpirun -n 2 --allow-run-as-root \
# Run summarization using the Qwen 14B model in FP16 using two GPUs.
mpirun -n 2 --allow-run-as-root \
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/14B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/14B/trt_engines/fp16/2-gpu/ \
@ -549,7 +549,7 @@ mpirun -n 2 --allow-run-as-root \
```
**Demo output of summarize.py:**
```bash
python ../summarize.py --test_trt_llm \
python ../../../summarize.py --test_trt_llm \
--hf_model_dir ./tmp/Qwen/7B/ \
--data_type fp16 \
--engine_dir ./tmp/Qwen/7B/trt_engines/fp16/1-gpu/ \

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -47,7 +47,7 @@
The built Qwen engines are located in `${ENGINE_DIR}/llm`.
You can also build an INT8 weight-only engine by pointing `--checkpoint_dir` at an INT8 weight-only checkpoint (see the sketch after this list).
For more information about Qwen, refer to the README.md in [`example/qwen`](../qwen).
For more information about Qwen, refer to the README.md in [`examples/models/core/qwen`](../qwen).
4. Assemble everything into the Qwen2-Audio pipeline.
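As mentioned in step 3, a hedged sketch of that swap (the checkpoint path `${INT8_CHECKPOINT_DIR}` is hypothetical, and all other build flags stay as in the original build command):

```bash
# Hypothetical: rebuild the LLM engine from an INT8 weight-only checkpoint
trtllm-build --checkpoint_dir ${INT8_CHECKPOINT_DIR} \
             --output_dir ${ENGINE_DIR}/llm
```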

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.dev0
datasets==3.1.0
evaluate

View File

@ -25,7 +25,7 @@
```
2. Convert
```bash
python3 ./examples/qwen/convert_checkpoint.py --model_dir=./Qwen-VL-Chat \
python3 ./examples/models/core/qwen/convert_checkpoint.py --model_dir=./Qwen-VL-Chat \
--output_dir=./tllm_checkpoint_1gpu \
--dtype float16
```

View File

Binary image file (2.4 MiB before and after).

View File

Binary image file (485 KiB before and after).

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
datasets==3.1.0
evaluate

View File

@ -4,14 +4,14 @@ This document shows how to build and run a [RecurrentGemma](https://github.com/g
## Overview
The TensorRT-LLM RecurrentGemma implementation can be found in [`tensorrt_llm/models/recurrentgemma/model.py`](../../tensorrt_llm/models/recurrentgemma/model.py). The TensorRT-LLM RecurrentGemma example code is located in [`examples/recurrentgemma`](./). There is one main file:
The TensorRT-LLM RecurrentGemma implementation can be found in [`tensorrt_llm/models/recurrentgemma/model.py`](../../../../tensorrt_llm/models/recurrentgemma/model.py). The TensorRT-LLM RecurrentGemma example code is located in [`examples/models/core/recurrentgemma`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the JAX format to the TensorRT-LLM format.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
In addition, there are two shared files in the top-level [`examples`](../../../) folder for inference and evaluation:
* [`../run.py`](../run.py) to run the inference on an input text;
* [`../summarize.py`](../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
* [`run.py`](../../../run.py) to run the inference on an input text;
* [`summarize.py`](../../../summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset.
## Support Matrix
| Checkpoint type | FP16 | BF16 | FP8 | INT8 SQ | INT4 AWQ | TP |
@ -63,7 +63,7 @@ python convert_checkpoint.py --model_dir ${CKPT_2B_PATH} \
# recurrentgemma-2b-it FP8 with FP8 kv cache
CKPT_2B_IT_PATH=./recurrentgemma_model/recurrentgemma-2b-it
UNIFIED_CKPT_2B_IT_FP8_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/fp8/1-gpu/
python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
@ -73,7 +73,7 @@ python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
# recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
UNIFIED_CKPT_2B_IT_INT8_SQ_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/int8_sq/1-gpu/
python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
--dtype float16 \
--qformat int8_sq \
--kv_cache_dtype int8 \
@ -83,7 +83,7 @@ python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
# recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
UNIFIED_CKPT_2B_IT_INT4_AWQ_PATH=./recurrentgemma_model/recurrentgemma-2b-it/trt_ckpt/int4_awq/1-gpu/
python ../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
python ../../../quantization/quantize.py --model_dir ${CKPT_2B_IT_PATH} \
--dtype float16 \
--qformat int4_awq \
--kv_cache_dtype int8 \
@ -182,7 +182,7 @@ Note that we need to download the dataset of MMLU first and the evaluation of MM
```bash
# recurrentgemma-2b
TOKENIZER_DIR_2B_PATH=./recurrentgemma_model/recurrentgemma-2b
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_PATH} \
@ -190,21 +190,21 @@ python3 ../run.py --max_output_len=100 \
# recurrentgemma-2b-it FP8 with FP8 kv cache
TOKENIZER_DIR_2B_IT_PATH=./recurrentgemma_model/recurrentgemma-2b-it
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
--engine_dir ${ENGINE_2B_IT_FP8_PATH}
# recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
--engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH}
# recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
@ -212,7 +212,7 @@ python3 ../run.py --max_output_len=100 \
# recurrentgemma-2b-flax
VOCAB_FILE_2B_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-flax/tokenizer.model
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--vocab_file ${VOCAB_FILE_2B_FLAX_PATH} \
@ -220,7 +220,7 @@ python3 ../run.py --max_output_len=100 \
# recurrentgemma-2b-it-flax
VOCAB_FILE_2B_IT_FLAX_PATH=./recurrentgemma_model/recurrentgemma-2b-it-flax/tokenizer.model
python3 ../run.py --max_output_len=100 \
python3 ../../../run.py --max_output_len=100 \
--use_py_session \
--max_attention_window_size 2048 \
--vocab_file ${VOCAB_FILE_2B_IT_FLAX_PATH} \
@ -231,7 +231,7 @@ python3 ../run.py --max_output_len=100 \
```bash
# recurrentgemma-2b
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_PATH} \
--batch_size 8 \
@ -239,7 +239,7 @@ python3 ../summarize.py --test_trt_llm \
--tokenizer_dir ${TOKENIZER_DIR_2B_PATH}
# recurrentgemma-2b-it FP8 with FP8 kv cache
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_IT_FP8_PATH} \
--batch_size 8 \
@ -247,7 +247,7 @@ python3 ../summarize.py --test_trt_llm \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}
# recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH} \
--batch_size 8 \
@ -255,7 +255,7 @@ python3 ../summarize.py --test_trt_llm \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}
# recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_IT_INT4_AWQ_PATH} \
--batch_size 8 \
@ -263,7 +263,7 @@ python3 ../summarize.py --test_trt_llm \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH}
# recurrentgemma-2b-flax
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_FLAX_PATH} \
--batch_size 8 \
@ -271,7 +271,7 @@ python3 ../summarize.py --test_trt_llm \
--vocab_file ${VOCAB_FILE_2B_FLAX_PATH}
# recurrentgemma-2b-it-flax
python3 ../summarize.py --test_trt_llm \
python3 ../../../summarize.py --test_trt_llm \
--use_py_session \
--engine_dir ${ENGINE_2B_IT_FLAX_PATH} \
--batch_size 8 \
@ -294,37 +294,37 @@ Evaluate on MMLU dataset.
```bash
# recurrentgemma-2b
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_PATH} \
--engine_dir ${ENGINE_2B_PATH}
# recurrentgemma-2b-it FP8 with FP8 kv cache
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
--engine_dir ${ENGINE_2B_IT_FP8_PATH}
# recurrentgemma-2b-it INT8 SmoothQuant with INT8 kv cache
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
--engine_dir ${ENGINE_2B_IT_INT8_SQ_PATH}
# recurrentgemma-2b-it INT4 AWQ with INT8 kv cache
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--tokenizer_dir ${TOKENIZER_DIR_2B_IT_PATH} \
--engine_dir ${ENGINE_2B_IT_INT4_AWQ_PATH}
# recurrentgemma-2b-flax
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--vocab_file ${VOCAB_FILE_2B_FLAX_PATH} \
--engine_dir ${ENGINE_2B_FLAX_PATH}
# recurrentgemma-2b-it-flax
python3 ../mmlu.py --test_trt_llm \
python3 ../../../mmlu.py --test_trt_llm \
--max_attention_window_size 2048 \
--vocab_file ${VOCAB_FILE_2B_IT_FLAX_PATH} \
--engine_dir ${ENGINE_2B_IT_FLAX_PATH}

View File

@ -1,4 +1,4 @@
-c ../constraints.txt
-c ../../../constraints.txt
tensorrt_llm>=0.0.0.dev0
git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2

Some files were not shown because too many files have changed in this diff.