# Configuration for Flux model transforms (export, optimizer, compile)
# Usage: python build_and_run_flux.py --config flux_transforms.yaml

# Export configuration
export:
  clone: false
  strict: false
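# Note on the two flags above (an assumption, not stated in this file):
# "strict" likely maps to torch.export's strict-tracing mode, and "clone"
# to whether the model is deep-copied before export; confirm against the
# export transform in TensorRT-LLM before relying on either.
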
# TODO: Integrate these transforms into the optimizer
# Optimizer configuration - FP8/FP4 quantization and fusion
# optimizer:
#   quantize_fp8_from_graph:
#     stage: "pattern_matcher"
#   quantize_nvfp4_from_graph:
#     stage: "pattern_matcher"
#   fuse_fp8_gemms:
#     stage: "post_load_fusion"
#   fuse_fp4_gemms:
#     stage: "post_load_fusion"
#   fuse_fp8_linear:
#     stage: "post_load_fusion"
#     backend: "torch"
#   fuse_nvfp4_linear:
#     stage: "post_load_fusion"
#     backend: "trtllm"
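# Note (assumption, not confirmed by this file): the stage names above
# appear to name phases of the optimizer pipeline, with "pattern_matcher"
# transforms rewriting the exported graph and "post_load_fusion" transforms
# running after checkpoint weights are loaded.
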
# Compilation configuration
compile:
  backend: "torch-opt"
  cuda_graph_batch_sizes: null
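  # Illustrative example (not from the source): to capture CUDA graphs for
  # fixed batch sizes instead of leaving capture disabled, replace null
  # with a list, e.g.:
  # cuda_graph_batch_sizes: [1, 2, 4, 8]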