mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* Update TensorRT-LLM --------- Co-authored-by: meghagarwal <16129366+megha95@users.noreply.github.com> Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
65 lines
2.6 KiB
Python
65 lines
2.6 KiB
Python
import argparse
|
|
|
|
from tensorrt_llm.quantization import quantize_and_export
|
|
|
|
# Defaults for --seed and --max_seq_length. Kept at module level so that
# build_parser() can be called without running the script entry point.
DEFAULT_RAND_SEED = 1234
DEFAULT_MAX_SEQ_LEN = 2048


def build_parser():
    """Create the argument parser for the quantization command line.

    Only ``--model_dir`` is required; every other option has a default.

    Returns:
        argparse.ArgumentParser: parser whose parsed namespace carries
        ``model_dir``, ``device``, ``dtype``, ``qformat``, ``seed``,
        ``max_seq_length``, ``batch_size``, ``calib_size``, ``output_dir``,
        ``tp_size``, ``pp_size``, ``awq_block_size`` and ``kv_cache_dtype``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--model_dir",
                        help="Specify where the HuggingFace model is",
                        required=True)
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--dtype", help="Model data type.", default="float16")
    parser.add_argument(
        "--qformat",
        help="Quantization format.",
        default="full_prec",
        choices=[
            "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo",
            "full_prec"
        ],
    )
    # The help text is assembled from adjacent string literals, so the first
    # piece must end with a space; otherwise it renders as
    # "callrandom.seed(value)".
    parser.add_argument(
        "--seed",
        help="Seed to generate random numbers, the value will be used to call "
        "random.seed(value) and numpy.random.seed(value)",
        type=int,
        default=DEFAULT_RAND_SEED)
    parser.add_argument("--max_seq_length",
                        help="Max sequence length to init the tokenizers",
                        type=int,
                        default=DEFAULT_MAX_SEQ_LEN)

    parser.add_argument("--batch_size",
                        help="Batch size for calibration.",
                        type=int,
                        default=1)
    parser.add_argument("--calib_size",
                        help="Number of samples for calibration.",
                        type=int,
                        default=512)
    parser.add_argument("--output_dir", default="exported_model")
    parser.add_argument("--tp_size", type=int, default=1)
    parser.add_argument("--pp_size", type=int, default=1)
    parser.add_argument("--awq_block_size", type=int, default=128)
    # Command-line values are always strings, so the None entry in `choices`
    # can never be selected explicitly; None is only obtained by omitting the
    # flag and falling back to the default.
    parser.add_argument("--kv_cache_dtype",
                        help="KV Cache dtype.",
                        default=None,
                        choices=["int8", "fp8", None])
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    # Every parsed option is forwarded by keyword to quantize_and_export,
    # which is imported from tensorrt_llm.quantization.
    quantize_and_export(model_dir=args.model_dir,
                        dtype=args.dtype,
                        output_dir=args.output_dir,
                        device=args.device,
                        tp_size=args.tp_size,
                        pp_size=args.pp_size,
                        qformat=args.qformat,
                        kv_cache_dtype=args.kv_cache_dtype,
                        calib_size=args.calib_size,
                        batch_size=args.batch_size,
                        awq_block_size=args.awq_block_size,
                        seed=args.seed,
                        max_seq_length=args.max_seq_length)
|