TensorRT-LLMs/examples/auto_deploy/onnx_export_llm.py

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ONNX export script for AutoDeploy models.
This script exports a HuggingFace model to ONNX format using the AutoDeploy
transform pipeline directly, without initializing the full LLM executor.
"""
import argparse

from tensorrt_llm._torch.auto_deploy.export import export_onnx
from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs

def main():
    parser = argparse.ArgumentParser(
        description="Export a HuggingFace model to ONNX format using AutoDeploy."
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="The HF model to use for ONNX export.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="The device to use when exporting the model.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="The directory to save the exported ONNX model.",
    )
    args = parser.parse_args()
    print(f"Constructing model from {args.model}")

    # To keep batch_size dynamic, the batch size must be > 1.
    # NOTE(yoco): This was originally 2, but for an unknown reason the
    # batch_size then collapsed to a static int 2 even though it was explicitly
    # marked as a dynamic axis. Stranger still, with 13 the batch_size stays
    # dynamic. Some value between 2 and 13 may also work; we use 13 here for
    # debugging purposes.
    max_batch_size = 13
    max_seq_len = 4
    # Prepare the AutoDeploy config; the mode is "export_edgellm_onnx".
    ad_config = LlmArgs(
        model=args.model,
        mode="export_edgellm_onnx",
        max_batch_size=max_batch_size,
        max_seq_len=max_seq_len,
        device=args.device,
    )
    ad_config.attn_backend = "torch"
    if args.output_dir is not None:
        ad_config.transforms["export_to_onnx"]["output_dir"] = args.output_dir

    # Use InferenceOptimizer directly instead of LLM to avoid executor
    # initialization.
    export_onnx(ad_config)


if __name__ == "__main__":
    main()
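
# Optional sanity check on the exported graph (a sketch, not part of this
# script): the output filename below is an assumption about what the
# "export_to_onnx" transform writes; adjust the path to match your output_dir.
# Requires the `onnx` package.
#
#   import onnx
#
#   model = onnx.load("./onnx_out/model.onnx")  # hypothetical output path
#   onnx.checker.check_model(model)
#   print([i.name for i in model.graph.input])  # inspect input names/axes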