# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ONNX export script for AutoDeploy models.
|
|
|
|
This script exports a HuggingFace model to ONNX format using the AutoDeploy
|
|
transform pipeline directly, without initializing the full LLM executor.
|
|
"""
|
|
|
|
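
# Example invocation (illustrative; the script filename is an assumption, the
# flags are the ones defined below):
#   python export_onnx.py --model <hf-model-id-or-path> --device cpu \
#       --output_dir ./onnx_out
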
import argparse

from tensorrt_llm._torch.auto_deploy.export import export_onnx
from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs


def main():
    parser = argparse.ArgumentParser(
        description="Export HuggingFace model to ONNX format using AutoDeploy."
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="The HF model to use for ONNX export.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="The device to use when exporting the model.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="The directory to save the exported ONNX model.",
    )
    args = parser.parse_args()

print(f"Constructing model from {args.model}")
|
|
|
|
    # To enable a dynamic batch_size, the export batch size must be > 1.
    # NOTE(yoco): This was originally 2. For reasons not yet understood, with a
    # value of 2 the batch dimension collapses to a static int 2 even though it
    # is explicitly declared as a dynamic axis. Stranger still, with a value of
    # 13 the batch dimension stays dynamic. Some value between 2 and 13 may
    # also work; we use 13 here for debugging purposes.
    max_batch_size = 13
    max_seq_len = 4

    # Prepare the AutoDeploy config; mode is export_edgellm_onnx.
    ad_config = LlmArgs(
        model=args.model,
        mode="export_edgellm_onnx",
        max_batch_size=max_batch_size,
        max_seq_len=max_seq_len,
        device=args.device,
    )
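    # Force the plain "torch" attention backend for export (assumption: this
    # keeps attention as standard PyTorch ops that trace cleanly to ONNX,
    # rather than backend-specific fused kernels).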
    ad_config.attn_backend = "torch"
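    # When an output directory is given, point the export_to_onnx transform at
    # it so the ONNX artifacts are written there instead of the default
    # location.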
    if args.output_dir is not None:
        ad_config.transforms["export_to_onnx"]["output_dir"] = args.output_dir

    # Use direct InferenceOptimizer instead of LLM to avoid executor
    # initialization.
    export_onnx(ad_config)


if __name__ == "__main__":
    main()