From d160439ef92910c09bd18dd9d30ee789551ddd6d Mon Sep 17 00:00:00 2001 From: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com> Date: Thu, 5 Feb 2026 21:32:22 -0800 Subject: [PATCH] [#11148][feat] AutoDeploy: Better structure the custom op (#11152) Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com> --- .../_torch/auto_deploy/custom_ops/__init__.py | 15 ++++++++ .../custom_ops/attention/__init__.py | 36 +++++++++++++++++++ .../{ => attention}/flashinfer_attention.py | 27 ++++++++++---- .../{ => attention}/onnx_attention.py | 5 +-- .../{ => attention}/torch_attention.py | 15 ++++++++ .../torch_backend_attention.py | 23 +++++++++--- .../{ => attention}/triton_attention.py | 25 ++++++++++--- .../triton_attention_with_kv_cache.py} | 15 ++++++++ .../custom_ops/attention_interface.py | 15 ++++++++ .../custom_ops/distributed/__init__.py | 26 ++++++++++++++ .../{ => distributed}/torch_dist.py | 17 ++++++++- .../{ => distributed}/trtllm_dist.py | 23 +++++++++--- .../auto_deploy/custom_ops/fla/__init__.py | 14 ++++++++ .../custom_ops/fla/delta_rule/__init__.py | 14 ++++++++ .../custom_ops/fla/fla_backend_delta.py | 15 ++++++++ .../auto_deploy/custom_ops/fla/fla_delta.py | 15 ++++++++ .../custom_ops/fused_moe/__init__.py | 14 ++++++++ .../custom_ops/fused_moe/load_moe_align.py | 15 ++++++++ .../custom_ops/fused_moe/mxfp4_moe.py | 15 ++++++++ .../custom_ops/fused_moe/torch_moe.py | 15 ++++++++ .../custom_ops/fused_moe/triton_moe.py | 15 ++++++++ .../custom_ops/fused_moe/trtllm_moe.py | 4 ++- .../auto_deploy/custom_ops/linear/__init__.py | 26 ++++++++++++++ .../custom_ops/{ => linear}/linear.py | 15 ++++++++ .../custom_ops/{ => linear}/torch_router.py | 15 ++++++++ .../auto_deploy/custom_ops/mamba/__init__.py | 14 ++++++++ .../mamba/flashinfer_backend_mamba.py | 15 ++++++++ .../mamba/torch_backend_causal_conv.py | 15 ++++++++ .../custom_ops/mamba/torch_backend_mamba.py | 15 ++++++++ .../custom_ops/mamba/torch_causal_conv.py | 15 ++++++++ .../custom_ops/mamba/torch_mamba.py | 15 ++++++++ .../auto_deploy/custom_ops/mla/__init__.py | 24 +++++++++++++ .../auto_deploy/custom_ops/{ => mla}/mla.py | 21 +++++++++-- .../custom_ops/normalization/__init__.py | 30 ++++++++++++++++ .../flashinfer_fused_add_rms_norm.py | 2 +- .../custom_ops/{ => normalization}/l2norm.py | 15 ++++++++ .../{ => normalization}/rms_norm.py | 21 +++++++++-- .../triton_rms_norm.py} | 15 ++++++++ .../custom_ops/quantization/__init__.py | 26 ++++++++++++++ .../custom_ops/{ => quantization}/quant.py | 17 ++++++++- .../{ => quantization}/torch_quant.py | 15 ++++++++ .../auto_deploy/custom_ops/rope/__init__.py | 30 ++++++++++++++++ .../custom_ops/{ => rope}/flashinfer_rope.py | 15 ++++++++ .../custom_ops/{ => rope}/torch_rope.py | 15 ++++++++ .../custom_ops/{ => rope}/triton_rope.py | 17 ++++++++- .../rope.py => rope/triton_rope_kernel.py} | 15 ++++++++ .../custom_ops/torch_libs/__init__.py | 14 ++++++++ .../custom_ops/triton_kernels/__init__.py | 0 .../auto_deploy/custom_ops/utils/__init__.py | 26 ++++++++++++++ .../{ => utils}/torch_gather_logits.py | 15 ++++++++ .../custom_ops/{ => utils}/triton_utils.py | 15 ++++++++ .../models/custom/modeling_nemotron_h.py | 2 +- .../transform/library/fused_add_rms_norm.py | 2 +- .../transform/library/fused_moe.py | 5 ++- .../transform/library/quantization.py | 2 +- .../auto_deploy/transform/library/rms_norm.py | 2 +- .../auto_deploy/transform/library/sharding.py | 2 +- .../auto_deploy/utils/quantization_utils.py | 2 +- .../test_allreduce_residual_rmsnorm_fusion.py | 4 ++- .../library/test_rmsnorm_sharding.py | 2 +- .../{ => attention}/test_attention_op.py | 0 .../test_flashinfer_attention_op.py | 4 ++- .../test_torch_attention_op.py | 0 .../test_triton_attention_with_kv_cache.py} | 2 +- .../test_cuda_causal_conv_cached_op.py | 0 .../test_flashinfer_mamba_cached_op.py | 2 +- .../test_torch_causal_conv_cached_op.py | 0 .../{ => mamba}/test_torch_mamba_cached_op.py | 0 .../test_triton_mamba_cached_op.py | 0 .../custom_ops/{ => moe}/test_ad_moe_op.py | 0 .../test_triton_moe.py | 0 .../custom_ops/{ => moe}/test_trtllm_moe.py | 2 +- .../test_flashinfer_fused_add_rms_norm_op.py | 2 +- .../test_mamba_rms_norm.py | 2 +- .../test_triton_rms_norm.py | 4 +-- .../{ => quantization}/test_quant.py | 0 .../{ => rope}/test_rope_op_variants.py | 0 .../test_rope.py => rope/test_triton_rope.py} | 0 .../custom_ops/test_update_kv_cache.py | 2 +- .../test_triton_utils.py | 2 +- .../library/test_fuse_l2norm.py | 2 +- .../library/test_fuse_rmsnorm.py | 2 +- .../library/test_fuse_rope_attention.py | 4 +-- .../library/test_fused_add_rms_norm.py | 4 +-- .../utils/test_quantization_utils.py | 2 +- 85 files changed, 872 insertions(+), 58 deletions(-) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/flashinfer_attention.py (95%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/onnx_attention.py (97%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/torch_attention.py (96%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/torch_backend_attention.py (95%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/triton_attention.py (93%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/attention_with_kv_cache.py => attention/triton_attention_with_kv_cache.py} (97%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => distributed}/torch_dist.py (72%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => distributed}/trtllm_dist.py (82%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => linear}/linear.py (55%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => linear}/torch_router.py (68%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => mla}/mla.py (91%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/flashinfer_fused_add_rms_norm.py (97%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/l2norm.py (60%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/rms_norm.py (92%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/rms_norm.py => normalization/triton_rms_norm.py} (67%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => quantization}/quant.py (94%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => quantization}/torch_quant.py (95%) create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/flashinfer_rope.py (77%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/torch_rope.py (85%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/triton_rope.py (83%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/rope.py => rope/triton_rope_kernel.py} (89%) delete mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => utils}/torch_gather_logits.py (70%) rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => utils}/triton_utils.py (80%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_attention_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_flashinfer_attention_op.py (99%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_torch_attention_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels/test_attention_with_kv_cache.py => attention/test_triton_attention_with_kv_cache.py} (99%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_cuda_causal_conv_cached_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_flashinfer_mamba_cached_op.py (98%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_torch_causal_conv_cached_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_torch_mamba_cached_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_triton_mamba_cached_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => moe}/test_ad_moe_op.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => moe}/test_triton_moe.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => moe}/test_trtllm_moe.py (99%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => normalization}/test_flashinfer_fused_add_rms_norm_op.py (93%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => normalization}/test_mamba_rms_norm.py (93%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => normalization}/test_triton_rms_norm.py (76%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => quantization}/test_quant.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => rope}/test_rope_op_variants.py (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels/test_rope.py => rope/test_triton_rope.py} (100%) rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => utils}/test_triton_utils.py (98%) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py index 8098a61987..ecedb47ee9 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops and make sure they are all registered.""" import importlib diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py new file mode 100644 index 0000000000..ead9ba122d --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Attention operations. + +This module provides various attention implementations and backends: +- torch_attention: PyTorch reference implementations +- torch_backend_attention: PyTorch-based attention backend +- flashinfer_attention: FlashInfer-based optimized attention +- triton_attention: Triton-based attention implementations +- triton_attention_with_kv_cache: Triton attention with KV cache support +- triton_attention_with_paged_kv_cache: Triton attention with paged KV cache +- onnx_attention: Placeholder ops for ONNX export of attention mechanisms +""" + +__all__ = [ + "torch_attention", + "torch_backend_attention", + "flashinfer_attention", + "triton_attention", + "triton_attention_with_kv_cache", + "triton_attention_with_paged_kv_cache", + "onnx_attention", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py similarity index 95% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py index 4767ad82a8..4183f5148c 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, fields from typing import Dict, List, Literal, Optional, Tuple, Union @@ -7,12 +22,12 @@ from torch._ops import OpOverloadPacket from torch._subclasses import FakeTensor from torch.fx import Node -from ....llmapi.llm_args import KvCacheConfig -from ...flashinfer_utils import get_env_enable_pdl -from ..utils.cuda_graph import cuda_graph_state -from ..utils.logger import ad_logger -from ..utils.node_utils import extract_op_args -from .attention_interface import ( +from .....llmapi.llm_args import KvCacheConfig +from ....flashinfer_utils import get_env_enable_pdl +from ...utils.cuda_graph import cuda_graph_state +from ...utils.logger import ad_logger +from ...utils.node_utils import extract_op_args +from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py similarity index 97% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py index 6e3de6149e..c729c27fe4 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py @@ -1,10 +1,11 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py similarity index 96% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py index 7fdacdb35e..da76b1e52e 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Torch reference implementations for attention.""" import math diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py similarity index 95% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py index ad806c68ae..a8f68574c5 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Torch backend attention using pure PyTorch reference implementations.""" import math @@ -8,10 +23,10 @@ from torch._ops import OpOverloadPacket from torch._subclasses import FakeTensor from torch.fx import Node -from ....llmapi.llm_args import KvCacheConfig -from ..utils.logger import ad_logger -from ..utils.node_utils import extract_op_args -from .attention_interface import ( +from .....llmapi.llm_args import KvCacheConfig +from ...utils.logger import ad_logger +from ...utils.node_utils import extract_op_args +from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py similarity index 93% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py index d99e428c76..70eb07e50d 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops for MHA/XQA attention.""" import math @@ -9,10 +24,10 @@ from torch._ops import OpOverloadPacket from torch._subclasses import FakeTensor from torch.fx import Node -from ....llmapi.llm_args import KvCacheConfig -from ..utils.logger import ad_logger -from ..utils.node_utils import extract_op_args -from .attention_interface import ( +from .....llmapi.llm_args import KvCacheConfig +from ...utils.logger import ad_logger +from ...utils.node_utils import extract_op_args +from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, @@ -21,7 +36,7 @@ from .attention_interface import ( ResourceHandlerDict, UnpagedResourceHandler, ) -from .triton_kernels.attention_with_kv_cache import ( +from .triton_attention_with_kv_cache import ( attention_kv_stage2, context_attention_kv_flattened, gqa_attention_kv_stage1, diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py similarity index 97% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py index d93c6a0464..15372e7bab 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Multi-head attention kernel that can operate with kv-caches.""" import triton diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py index 2169585849..4af2891c0a 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Attention Interface to handle various attention operators and cache operations. This module provides an interface between the high-level runtime and cache management system and diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py new file mode 100644 index 0000000000..9fa04a2089 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Distributed operations. + +This module provides distributed communication primitives: +- torch_dist: PyTorch distributed backend operations +- trtllm_dist: TensorRT-LLM optimized distributed operations +""" + +__all__ = [ + "torch_dist", + "trtllm_dist", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py similarity index 72% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py index 5cbda61970..5e618ecafd 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops required for implementing tensor parallelism. This module defines atomic distributed ops - each op uses a specific backend @@ -8,7 +23,7 @@ from typing import List, Optional import torch -from ..distributed import common as dist +from ...distributed import common as dist # ============================================================================ # PyTorch Distributed Backend Ops (demollm mode) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py similarity index 82% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py index 1f4b9229ca..a213ddb0d9 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """TRT-LLM distributed operations and fused kernels. This module defines atomic TRT-LLM-specific ops that use optimized kernels. @@ -9,10 +24,10 @@ from typing import List, Optional import torch # use trtllm distributed ops to improve TP performance if possible -from ....mapping import Mapping -from ...distributed import AllReduce, allgather -from ...modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy -from ..distributed.common import ReduceOp, get_rank_world_size, get_world_size, is_ompi +from .....mapping import Mapping +from ....distributed import AllReduce, allgather +from ....modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy +from ...distributed.common import ReduceOp, get_rank_world_size, get_world_size, is_ompi # Cache AllReduce modules to avoid recreating on every call # This is critical for CUDA graph compatibility - recreating modules during diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py index e69de29bb2..ea14aa9647 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py index e69de29bb2..ea14aa9647 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py index 5c52410f24..6026dfe4d5 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Cached attention op for delta rule using the fla kernel library. Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484 diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py index 22d3445535..9cbe89a664 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops corresponding to fla's chunked delta rule. Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484 diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py index e69de29bb2..ea14aa9647 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py index 2d0ee66cfc..3ff67e4378 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ AOT-compiled moe_align CUDA kernel. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py index c0a55d6c23..91842847e8 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Triton-kernels-based MXFP4 MoE ops (GPT-OSS style) with routing, swizzling, and fused activation from typing import Callable, Tuple diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py index df99815f97..dd963c2fc1 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Callable, List import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py index d33b752532..713d7dba03 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Triton implementation of the Fused MOE ops. Inspired by vLLM's triton MOE implementation. """ diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py index 7a7a53d960..57f3392a26 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py @@ -15,7 +15,9 @@ import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.quant import TRTLLM_NVFP4_SCALING_VECTOR_SIZE +from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import ( + TRTLLM_NVFP4_SCALING_VECTOR_SIZE, +) from tensorrt_llm._torch.utils import ActivationType diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py new file mode 100644 index 0000000000..b11ccc7ee6 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Linear operations. + +This module provides linear layer implementations: +- linear: Linear layer operations +- torch_router: MoE router operations +""" + +__all__ = [ + "linear", + "torch_router", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py similarity index 55% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py index 214626ad24..4a7a04adef 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops for linear layers.""" from typing import Optional diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py similarity index 68% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py index 57613c5e77..2cfc4a508d 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.nn.functional as F diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py index e69de29bb2..ea14aa9647 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py index f3ec5314f1..15d46a329d 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py index 2d2aab3b2c..75b73088a9 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom op collection for cached causal conv1d in pure PyTorch. This mirrors the structure used by the cached Mamba/SSM ops: diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py index dfd3cb07ce..7d9e04abc3 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom op collection for cached mamba2 ssm transform (linear attention) in pure PyTorch. This file contains two kinds of functionality: diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py index a53f984012..28b15388fb 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom op collection for uncached causal conv (sliding window with 1d).""" from typing import Optional diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py index 752520a74a..dbec15699e 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom op collection for uncached mamba mixer (linear attention).""" from typing import List, Tuple diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py new file mode 100644 index 0000000000..b2c4737b67 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multi-head Latent Attention operations. + +This module provides Multi-head Latent Attention (MLA) implementations: +- mla: MLA operations and attention descriptor +""" + +__all__ = [ + "mla", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py similarity index 91% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py index e8673ae20e..f435fc5818 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops for MultiHead Latent attention.""" import math @@ -7,8 +22,9 @@ import torch from torch._ops import OpOverloadPacket from torch.fx import Node -from ....llmapi.llm_args import KvCacheConfig -from .attention_interface import ( +from .....llmapi.llm_args import KvCacheConfig +from ..attention.triton_attention import _decode_attention, _prefill_attention +from ..attention_interface import ( AttentionDescriptor, AttentionLayout, AttentionRegistry, @@ -16,7 +32,6 @@ from .attention_interface import ( ResourceHandlerDict, UnpagedResourceHandler, ) -from .triton_attention import _decode_attention, _prefill_attention Constant = Union[int, float, str, None] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py new file mode 100644 index 0000000000..f9a763fc2f --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Normalization operations. + +This module provides various normalization implementations: +- rms_norm: RMSNorm implementations (FlashInfer, Triton, reference) +- triton_rms_norm: Low-level Triton RMSNorm kernel +- l2norm: L2 normalization operations +- flashinfer_fused_add_rms_norm: Fused add + RMSNorm operation +""" + +__all__ = [ + "rms_norm", + "triton_rms_norm", + "l2norm", + "flashinfer_fused_add_rms_norm", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py similarity index 97% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py index d7a183ce90..61b73a77c5 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py @@ -12,7 +12,7 @@ import flashinfer import torch -from ...flashinfer_utils import get_env_enable_pdl +from ....flashinfer_utils import get_env_enable_pdl @torch.library.custom_op( diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py similarity index 60% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py index 28380c346d..6e030a1362 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom ops corresponding to l2norm.""" import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py similarity index 92% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py index 2f85d87518..45a7080d5a 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Custom operator for FlashInfer and Triton RMSNorm implementation.""" import flashinfer @@ -6,9 +21,9 @@ import torch.distributed as dist import torch.nn.functional as F from einops import rearrange -from ...flashinfer_utils import get_env_enable_pdl -from ...modules.mamba.layernorm_gated import _layer_norm_fwd -from .triton_kernels.rms_norm import rms_norm +from ....flashinfer_utils import get_env_enable_pdl +from ....modules.mamba.layernorm_gated import _layer_norm_fwd +from .triton_rms_norm import rms_norm @torch.library.custom_op("auto_deploy::flashinfer_rms_norm", mutates_args=()) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py similarity index 67% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py index 70c67ac80a..6c1d3774f3 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import triton import triton.language as tl diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py new file mode 100644 index 0000000000..e968c77643 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quantization operations. + +This module provides quantization utilities and operations: +- quant: Quantization operations (FP8, FP4, INT4, INT8) +- torch_quant: PyTorch-based quantization implementations +""" + +__all__ = [ + "quant", + "torch_quant", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py similarity index 94% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py index cfb4049923..ff5e1133c0 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Definition of the quant module that can be used for PTQ.""" import warnings @@ -7,7 +22,7 @@ import torch from flashinfer import bmm_fp8 from torch import nn -from .torch_libs.float8_python_api import addmm_float8_unwrapped +from ..torch_libs.float8_python_api import addmm_float8_unwrapped TRTLLM_FP4_OP_AVAILABLE = True diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py similarity index 95% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py index 78a1fe5d83..4ad8dd7b8e 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List, Optional import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py new file mode 100644 index 0000000000..3f1cf25763 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RoPE (Rotary Position Embedding) operations. + +This module provides various RoPE implementations: +- torch_rope: PyTorch reference implementation +- flashinfer_rope: FlashInfer-based optimized RoPE +- triton_rope: Triton-based RoPE implementation +- triton_rope_kernel: Low-level Triton kernels for RoPE +""" + +__all__ = [ + "torch_rope", + "flashinfer_rope", + "triton_rope", + "triton_rope_kernel", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py similarity index 77% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py index e4f329eeec..defe5fca5f 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Tuple import flashinfer diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py similarity index 85% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py index da769158b6..ce35e51b85 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Tuple import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py similarity index 83% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py index b9282d3038..3c5d79c0f0 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py @@ -1,7 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import triton -from .triton_kernels.rope import rope_fwd_flattened_kernel, rope_fwd_kernel +from .triton_rope_kernel import rope_fwd_flattened_kernel, rope_fwd_kernel @torch.library.custom_op("auto_deploy::triton_rope_with_input_pos", mutates_args=()) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py similarity index 89% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py index dc3701edd2..4139d3a9d9 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import triton import triton.language as tl diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py index e69de29bb2..ea14aa9647 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py new file mode 100644 index 0000000000..bbd45886bc --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility operations. + +This module provides utility functions and helpers: +- torch_gather_logits: Logit gathering operations +- triton_utils: Triton utility functions and helpers +""" + +__all__ = [ + "torch_gather_logits", + "triton_utils", +] diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py similarity index 70% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py index 7669ea8966..e86097d687 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py similarity index 80% rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py rename to tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py index f65a570bc1..64c2b01aa3 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Triton utility operations for auto_deploy.""" import torch diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py index 259f997afd..b71a0da2f6 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py @@ -31,7 +31,7 @@ from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from transformers.utils import ModelOutput -from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import gated_rms_norm_ref +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import gated_rms_norm_ref from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory from tensorrt_llm._torch.utils import ActivationType diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py index d0bfeee09b..103578ca01 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py @@ -16,7 +16,7 @@ from typing import Tuple import torch from torch.fx import GraphModule -from ...custom_ops.flashinfer_fused_add_rms_norm import flashinfer_fused_add_rms_norm +from ...custom_ops.normalization.flashinfer_fused_add_rms_norm import flashinfer_fused_add_rms_norm from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py index f7dc25e0f0..94dc64892c 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py @@ -9,7 +9,10 @@ from torch.fx import GraphModule, Node from tensorrt_llm._torch.utils import ActivationType -from ...custom_ops.quant import TRTLLM_NVFP4_PACKING_FACTOR, TRTLLM_NVFP4_SCALING_VECTOR_SIZE +from ...custom_ops.quantization.quant import ( + TRTLLM_NVFP4_PACKING_FACTOR, + TRTLLM_NVFP4_SCALING_VECTOR_SIZE, +) from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface from ...utils._graph import delete_all_unused_submodules, eliminate_dead_code, get_attr_by_name diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py index a9254b1294..d86c8244a5 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn from torch.fx import GraphModule, Node -from ...custom_ops.quant import ( +from ...custom_ops.quantization.quant import ( FP4_GLOBAL_SCALE_MAX, FP8_MAX, TRTLLM_NVFP4_COLUMN_SIZE, diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py index 5e06842989..9c5c5247f4 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py @@ -6,7 +6,7 @@ import torch from pydantic import Field from torch.fx import GraphModule, Node -from ...custom_ops.rms_norm import gated_rms_norm_ref +from ...custom_ops.normalization.rms_norm import gated_rms_norm_ref from ...models.factory import ModelFactory from ...shim.interface import CachedSequenceInterface diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py index f93b506201..9d6139dfec 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py @@ -30,7 +30,7 @@ from pydantic import BaseModel, Field, field_validator from torch.fx import GraphModule, Node from .....functional import AllReduceStrategy -from ...custom_ops.trtllm_dist import is_trtllm_op_available +from ...custom_ops.distributed.trtllm_dist import is_trtllm_op_available from ...models.factory import ModelFactory, ShardingConfigSource from ...shim.interface import CachedSequenceInterface from ...utils._graph import del_attr_by_name, eliminate_dead_code diff --git a/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py index 889b06edb0..dd32386f50 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py @@ -5,7 +5,7 @@ import torch import torch.nn.functional as F from torch.fx import GraphModule, Node -from ..custom_ops.quant import FP4_GLOBAL_SCALE_MAX, FP8_MAX +from ..custom_ops.quantization.quant import FP4_GLOBAL_SCALE_MAX, FP8_MAX from .logger import ad_logger from .node_utils import ( extract_weight_name, diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py index 8c15ce1820..7df5b1ce1b 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py @@ -5,7 +5,9 @@ import torch from _dist_test_utils import get_device_counts from torch.export import export -from tensorrt_llm._torch.auto_deploy.custom_ops.trtllm_dist import is_trtllm_op_available +from tensorrt_llm._torch.auto_deploy.custom_ops.distributed.trtllm_dist import ( + is_trtllm_op_available, +) from tensorrt_llm._torch.auto_deploy.distributed.common import initialize_or_skip from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py index 6248174516..0bf32996a5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py @@ -22,7 +22,7 @@ import torch import torch.nn as nn # Ensure custom ops are registered -from tensorrt_llm._torch.auto_deploy.custom_ops import rms_norm # noqa: F401 +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization import rms_norm # noqa: F401 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_attention_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_attention_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py similarity index 99% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py index aa2095f2f0..314d471e2d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py @@ -3,7 +3,9 @@ import pytest import torch from torch_attention_reference import TorchAttentionReference -from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_attention import _GlobalFlashInferPlanner +from tensorrt_llm._torch.auto_deploy.custom_ops.attention.flashinfer_attention import ( + _GlobalFlashInferPlanner, +) def _create_combined_kv_cache(k_cache: torch.Tensor, v_cache: torch.Tensor) -> torch.Tensor: diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_torch_attention_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_torch_attention_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py similarity index 99% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py index d20ec16c7d..371637c5aa 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py @@ -7,7 +7,7 @@ import triton from _custom_op_utils import torch_rope_reference from _model_test_utils import repeat_kv -from tensorrt_llm._torch.auto_deploy.custom_ops.triton_kernels.attention_with_kv_cache import ( +from tensorrt_llm._torch.auto_deploy.custom_ops.attention.triton_attention_with_kv_cache import ( attention_kv_stage1, attention_kv_stage2, context_attention_kv, diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_cuda_causal_conv_cached_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_cuda_causal_conv_cached_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py similarity index 98% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py index 1ebe3a948a..f9e92638d3 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py @@ -2,7 +2,7 @@ import pytest import torch import tensorrt_llm._torch.auto_deploy # noqa: F401 -from tests.unittest._torch.auto_deploy.unit.singlegpu.custom_ops.test_triton_mamba_cached_op import ( +from tests.unittest._torch.auto_deploy.unit.singlegpu.custom_ops.mamba.test_triton_mamba_cached_op import ( _random_params, ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_causal_conv_cached_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_causal_conv_cached_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_mamba_cached_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_mamba_cached_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_triton_mamba_cached_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_triton_mamba_cached_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_ad_moe_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_ad_moe_op.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_ad_moe_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_ad_moe_op.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_moe.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_triton_moe.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_moe.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_triton_moe.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py similarity index 99% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py index c41bf1a601..d85166d9c9 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py @@ -13,7 +13,7 @@ from torch.nn import functional as F from utils.util import skip_pre_hopper import tensorrt_llm._torch.auto_deploy.custom_ops # noqa: F401 -from tensorrt_llm._torch.auto_deploy.custom_ops.quant import ( +from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import ( TRTLLM_NVFP4_COLUMN_SIZE, TRTLLM_NVFP4_ROW_SIZE, TRTLLM_NVFP4_SCALING_VECTOR_SIZE, diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py similarity index 93% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py index f6d67afb04..d6bc714eed 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py @@ -1,7 +1,7 @@ import pytest import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import ( +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.flashinfer_fused_add_rms_norm import ( flashinfer_fused_add_rms_norm, ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py similarity index 93% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py index 59952a6c89..0917090f64 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py @@ -1,7 +1,7 @@ import pytest import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import ( +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import ( gated_rms_norm_ref, triton_rmsnorm_gated, ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py similarity index 76% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py index 78b45cfd4a..4720f0876a 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py @@ -1,7 +1,7 @@ import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import * # noqa -from tensorrt_llm._torch.auto_deploy.custom_ops.triton_kernels.rms_norm import rms_norm +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.triton_rms_norm import rms_norm def test_rmsnorm_triton_op(): diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/quantization/test_quant.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/quantization/test_quant.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_rope_op_variants.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_rope_op_variants.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_rope_op_variants.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_rope_op_variants.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_rope.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_triton_rope.py similarity index 100% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_rope.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_triton_rope.py diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py index 5c3bc1714c..34baf42d43 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py @@ -1,6 +1,6 @@ import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.torch_attention import update_kv_cache +from tensorrt_llm._torch.auto_deploy.custom_ops.attention.torch_attention import update_kv_cache def test_update_kv_cache(): diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py similarity index 98% rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py index de684fb6f8..4d68ec9d54 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py @@ -4,7 +4,7 @@ import pytest import torch # Import to register the custom op -from tensorrt_llm._torch.auto_deploy.custom_ops import triton_utils # noqa: F401 +from tensorrt_llm._torch.auto_deploy.custom_ops.utils import triton_utils # noqa: F401 def _reference_gather_scatter( diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py index e52b8e4392..ce8c0d50d6 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py @@ -3,7 +3,7 @@ import torch from _graph_test_helpers import run_test_transformed_gm from torch.export import Dim -from tensorrt_llm._torch.auto_deploy.custom_ops.l2norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.l2norm import * # noqa from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py index c62e5d5396..3d83eb9151 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py @@ -3,7 +3,7 @@ import torch from _graph_test_helpers import run_test_transformed_gm from torch.export import Dim -from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import * # noqa from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py index 561fe34338..1fd19e6cdc 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py @@ -23,8 +23,8 @@ import torch from torch.export import Dim # Import modules to register custom ops (torch.ops.auto_deploy.*) -import tensorrt_llm._torch.auto_deploy.custom_ops.torch_attention # noqa: F401 -import tensorrt_llm._torch.auto_deploy.custom_ops.torch_rope # noqa: F401 +import tensorrt_llm._torch.auto_deploy.custom_ops.attention.torch_attention # noqa: F401 +import tensorrt_llm._torch.auto_deploy.custom_ops.rope.torch_rope # noqa: F401 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py index 6926e980c9..dc353e4993 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py @@ -1,8 +1,8 @@ import torch from torch.export import Dim -from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import * # noqa -from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.flashinfer_fused_add_rms_norm import * # noqa +from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import * # noqa from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py index 4fb9cc1359..a3a9ad80ac 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py @@ -1,7 +1,7 @@ import pytest import torch -from tensorrt_llm._torch.auto_deploy.custom_ops.quant import FP8_MAX +from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import FP8_MAX from tensorrt_llm._torch.auto_deploy.transform.interface import TransformConfig from tensorrt_llm._torch.auto_deploy.transform.library.quantization import ( FP8LinearQuantizationFromConfig,