From d160439ef92910c09bd18dd9d30ee789551ddd6d Mon Sep 17 00:00:00 2001
From: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com>
Date: Thu, 5 Feb 2026 21:32:22 -0800
Subject: [PATCH] [#11148][feat] AutoDeploy: Better structure the custom op
 (#11152)

Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com>
---
 .../_torch/auto_deploy/custom_ops/__init__.py | 15 ++++++++
 .../custom_ops/attention/__init__.py          | 36 +++++++++++++++++++
 .../{ => attention}/flashinfer_attention.py   | 27 ++++++++++----
 .../{ => attention}/onnx_attention.py         |  5 +--
 .../{ => attention}/torch_attention.py        | 15 ++++++++
 .../torch_backend_attention.py                | 23 +++++++++---
 .../{ => attention}/triton_attention.py       | 25 ++++++++++---
 .../triton_attention_with_kv_cache.py}        | 15 ++++++++
 .../custom_ops/attention_interface.py         | 15 ++++++++
 .../custom_ops/distributed/__init__.py        | 26 ++++++++++++++
 .../{ => distributed}/torch_dist.py           | 17 ++++++++-
 .../{ => distributed}/trtllm_dist.py          | 23 +++++++++---
 .../auto_deploy/custom_ops/fla/__init__.py    | 14 ++++++++
 .../custom_ops/fla/delta_rule/__init__.py     | 14 ++++++++
 .../custom_ops/fla/fla_backend_delta.py       | 15 ++++++++
 .../auto_deploy/custom_ops/fla/fla_delta.py   | 15 ++++++++
 .../custom_ops/fused_moe/__init__.py          | 14 ++++++++
 .../custom_ops/fused_moe/load_moe_align.py    | 15 ++++++++
 .../custom_ops/fused_moe/mxfp4_moe.py         | 15 ++++++++
 .../custom_ops/fused_moe/torch_moe.py         | 15 ++++++++
 .../custom_ops/fused_moe/triton_moe.py        | 15 ++++++++
 .../custom_ops/fused_moe/trtllm_moe.py        |  4 ++-
 .../auto_deploy/custom_ops/linear/__init__.py | 26 ++++++++++++++
 .../custom_ops/{ => linear}/linear.py         | 15 ++++++++
 .../custom_ops/{ => linear}/torch_router.py   | 15 ++++++++
 .../auto_deploy/custom_ops/mamba/__init__.py  | 14 ++++++++
 .../mamba/flashinfer_backend_mamba.py         | 15 ++++++++
 .../mamba/torch_backend_causal_conv.py        | 15 ++++++++
 .../custom_ops/mamba/torch_backend_mamba.py   | 15 ++++++++
 .../custom_ops/mamba/torch_causal_conv.py     | 15 ++++++++
 .../custom_ops/mamba/torch_mamba.py           | 15 ++++++++
 .../auto_deploy/custom_ops/mla/__init__.py    | 24 +++++++++++++
 .../auto_deploy/custom_ops/{ => mla}/mla.py   | 21 +++++++++--
 .../custom_ops/normalization/__init__.py      | 30 ++++++++++++++++
 .../flashinfer_fused_add_rms_norm.py          |  2 +-
 .../custom_ops/{ => normalization}/l2norm.py  | 15 ++++++++
 .../{ => normalization}/rms_norm.py           | 21 +++++++++--
 .../triton_rms_norm.py}                       | 15 ++++++++
 .../custom_ops/quantization/__init__.py       | 26 ++++++++++++++
 .../custom_ops/{ => quantization}/quant.py    | 17 ++++++++-
 .../{ => quantization}/torch_quant.py         | 15 ++++++++
 .../auto_deploy/custom_ops/rope/__init__.py   | 30 ++++++++++++++++
 .../custom_ops/{ => rope}/flashinfer_rope.py  | 15 ++++++++
 .../custom_ops/{ => rope}/torch_rope.py       | 15 ++++++++
 .../custom_ops/{ => rope}/triton_rope.py      | 17 ++++++++-
 .../rope.py => rope/triton_rope_kernel.py}    | 15 ++++++++
 .../custom_ops/torch_libs/__init__.py         | 14 ++++++++
 .../custom_ops/triton_kernels/__init__.py     |  0
 .../auto_deploy/custom_ops/utils/__init__.py  | 26 ++++++++++++++
 .../{ => utils}/torch_gather_logits.py        | 15 ++++++++
 .../custom_ops/{ => utils}/triton_utils.py    | 15 ++++++++
 .../models/custom/modeling_nemotron_h.py      |  2 +-
 .../transform/library/fused_add_rms_norm.py   |  2 +-
 .../transform/library/fused_moe.py            |  5 ++-
 .../transform/library/quantization.py         |  2 +-
 .../auto_deploy/transform/library/rms_norm.py |  2 +-
 .../auto_deploy/transform/library/sharding.py |  2 +-
 .../auto_deploy/utils/quantization_utils.py   |  2 +-
 .../test_allreduce_residual_rmsnorm_fusion.py |  4 ++-
 .../library/test_rmsnorm_sharding.py          |  2 +-
 .../{ => attention}/test_attention_op.py      |  0
 .../test_flashinfer_attention_op.py           |  4 ++-
 .../test_torch_attention_op.py                |  0
 .../test_triton_attention_with_kv_cache.py}   |  2 +-
 .../test_cuda_causal_conv_cached_op.py        |  0
 .../test_flashinfer_mamba_cached_op.py        |  2 +-
 .../test_torch_causal_conv_cached_op.py       |  0
 .../{ => mamba}/test_torch_mamba_cached_op.py |  0
 .../test_triton_mamba_cached_op.py            |  0
 .../custom_ops/{ => moe}/test_ad_moe_op.py    |  0
 .../test_triton_moe.py                        |  0
 .../custom_ops/{ => moe}/test_trtllm_moe.py   |  2 +-
 .../test_flashinfer_fused_add_rms_norm_op.py  |  2 +-
 .../test_mamba_rms_norm.py                    |  2 +-
 .../test_triton_rms_norm.py                   |  4 +--
 .../{ => quantization}/test_quant.py          |  0
 .../{ => rope}/test_rope_op_variants.py       |  0
 .../test_rope.py => rope/test_triton_rope.py} |  0
 .../custom_ops/test_update_kv_cache.py        |  2 +-
 .../test_triton_utils.py                      |  2 +-
 .../library/test_fuse_l2norm.py               |  2 +-
 .../library/test_fuse_rmsnorm.py              |  2 +-
 .../library/test_fuse_rope_attention.py       |  4 +--
 .../library/test_fused_add_rms_norm.py        |  4 +--
 .../utils/test_quantization_utils.py          |  2 +-
 85 files changed, 872 insertions(+), 58 deletions(-)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/flashinfer_attention.py (95%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/onnx_attention.py (97%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/torch_attention.py (96%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/torch_backend_attention.py (95%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => attention}/triton_attention.py (93%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/attention_with_kv_cache.py => attention/triton_attention_with_kv_cache.py} (97%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => distributed}/torch_dist.py (72%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => distributed}/trtllm_dist.py (82%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => linear}/linear.py (55%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => linear}/torch_router.py (68%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => mla}/mla.py (91%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/flashinfer_fused_add_rms_norm.py (97%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/l2norm.py (60%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => normalization}/rms_norm.py (92%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/rms_norm.py => normalization/triton_rms_norm.py} (67%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => quantization}/quant.py (94%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => quantization}/torch_quant.py (95%)
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/flashinfer_rope.py (77%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/torch_rope.py (85%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => rope}/triton_rope.py (83%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{triton_kernels/rope.py => rope/triton_rope_kernel.py} (89%)
 delete mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py
 create mode 100644 tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => utils}/torch_gather_logits.py (70%)
 rename tensorrt_llm/_torch/auto_deploy/custom_ops/{ => utils}/triton_utils.py (80%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_attention_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_flashinfer_attention_op.py (99%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => attention}/test_torch_attention_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels/test_attention_with_kv_cache.py => attention/test_triton_attention_with_kv_cache.py} (99%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_cuda_causal_conv_cached_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_flashinfer_mamba_cached_op.py (98%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_torch_causal_conv_cached_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_torch_mamba_cached_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => mamba}/test_triton_mamba_cached_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => moe}/test_ad_moe_op.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => moe}/test_triton_moe.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => moe}/test_trtllm_moe.py (99%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => normalization}/test_flashinfer_fused_add_rms_norm_op.py (93%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => normalization}/test_mamba_rms_norm.py (93%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => normalization}/test_triton_rms_norm.py (76%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => quantization}/test_quant.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{ => rope}/test_rope_op_variants.py (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels/test_rope.py => rope/test_triton_rope.py} (100%)
 rename tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/{triton_kernels => utils}/test_triton_utils.py (98%)

diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py
index 8098a61987..ecedb47ee9 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops and make sure they are all registered."""
 
 import importlib
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py
new file mode 100644
index 0000000000..ead9ba122d
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/__init__.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Attention operations.
+
+This module provides various attention implementations and backends:
+- torch_attention: PyTorch reference implementations
+- torch_backend_attention: PyTorch-based attention backend
+- flashinfer_attention: FlashInfer-based optimized attention
+- triton_attention: Triton-based attention implementations
+- triton_attention_with_kv_cache: Triton attention with KV cache support
+- onnx_attention: Placeholder ops for ONNX export of attention mechanisms
+"""
+
+# Keep in sync with the submodule files in this package: a stale name breaks `import *`.
+__all__ = [
+    "torch_attention",
+    "torch_backend_attention",
+    "flashinfer_attention",
+    "triton_attention",
+    "triton_attention_with_kv_cache",
+    "onnx_attention",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py
similarity index 95%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py
index 4767ad82a8..4183f5148c 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass, fields
 from typing import Dict, List, Literal, Optional, Tuple, Union
 
@@ -7,12 +22,12 @@ from torch._ops import OpOverloadPacket
 from torch._subclasses import FakeTensor
 from torch.fx import Node
 
-from ....llmapi.llm_args import KvCacheConfig
-from ...flashinfer_utils import get_env_enable_pdl
-from ..utils.cuda_graph import cuda_graph_state
-from ..utils.logger import ad_logger
-from ..utils.node_utils import extract_op_args
-from .attention_interface import (
+from .....llmapi.llm_args import KvCacheConfig
+from ....flashinfer_utils import get_env_enable_pdl
+from ...utils.cuda_graph import cuda_graph_state
+from ...utils.logger import ad_logger
+from ...utils.node_utils import extract_op_args
+from ..attention_interface import (
     AttentionDescriptor,
     AttentionLayout,
     AttentionRegistry,
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py
similarity index 97%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py
index 6e3de6149e..c729c27fe4 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/onnx_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/onnx_attention.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py
similarity index 96%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py
index 7fdacdb35e..da76b1e52e 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_attention.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Torch reference implementations for attention."""
 
 import math
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py
similarity index 95%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py
index ad806c68ae..a8f68574c5 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/torch_backend_attention.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Torch backend attention using pure PyTorch reference implementations."""
 
 import math
@@ -8,10 +23,10 @@ from torch._ops import OpOverloadPacket
 from torch._subclasses import FakeTensor
 from torch.fx import Node
 
-from ....llmapi.llm_args import KvCacheConfig
-from ..utils.logger import ad_logger
-from ..utils.node_utils import extract_op_args
-from .attention_interface import (
+from .....llmapi.llm_args import KvCacheConfig
+from ...utils.logger import ad_logger
+from ...utils.node_utils import extract_op_args
+from ..attention_interface import (
     AttentionDescriptor,
     AttentionLayout,
     AttentionRegistry,
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py
similarity index 93%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py
index d99e428c76..70eb07e50d 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops for MHA/XQA attention."""
 
 import math
@@ -9,10 +24,10 @@ from torch._ops import OpOverloadPacket
 from torch._subclasses import FakeTensor
 from torch.fx import Node
 
-from ....llmapi.llm_args import KvCacheConfig
-from ..utils.logger import ad_logger
-from ..utils.node_utils import extract_op_args
-from .attention_interface import (
+from .....llmapi.llm_args import KvCacheConfig
+from ...utils.logger import ad_logger
+from ...utils.node_utils import extract_op_args
+from ..attention_interface import (
     AttentionDescriptor,
     AttentionLayout,
     AttentionRegistry,
@@ -21,7 +36,7 @@ from .attention_interface import (
     ResourceHandlerDict,
     UnpagedResourceHandler,
 )
-from .triton_kernels.attention_with_kv_cache import (
+from .triton_attention_with_kv_cache import (
     attention_kv_stage2,
     context_attention_kv_flattened,
     gqa_attention_kv_stage1,
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py
similarity index 97%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py
index d93c6a0464..15372e7bab 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/attention_with_kv_cache.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_attention_with_kv_cache.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Multi-head attention kernel that can operate with kv-caches."""
 
 import triton
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py
index 2169585849..4af2891c0a 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Attention Interface to handle various attention operators and cache operations.
 
 This module provides an interface between the high-level runtime and cache management system and
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py
new file mode 100644
index 0000000000..9fa04a2089
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Distributed operations.
+
+This module provides distributed communication primitives:
+- torch_dist: PyTorch distributed backend operations
+- trtllm_dist: TensorRT-LLM optimized distributed operations
+"""
+
+__all__ = [
+    "torch_dist",
+    "trtllm_dist",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py
similarity index 72%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py
index 5cbda61970..5e618ecafd 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_dist.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/torch_dist.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops required for implementing tensor parallelism.
 
 This module defines atomic distributed ops - each op uses a specific backend
@@ -8,7 +23,7 @@ from typing import List, Optional
 
 import torch
 
-from ..distributed import common as dist
+from ...distributed import common as dist
 
 # ============================================================================
 # PyTorch Distributed Backend Ops (demollm mode)
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py
similarity index 82%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py
index 1f4b9229ca..a213ddb0d9 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """TRT-LLM distributed operations and fused kernels.
 
 This module defines atomic TRT-LLM-specific ops that use optimized kernels.
@@ -9,10 +24,10 @@ from typing import List, Optional
 import torch
 
 # use trtllm distributed ops to improve TP performance if possible
-from ....mapping import Mapping
-from ...distributed import AllReduce, allgather
-from ...modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy
-from ..distributed.common import ReduceOp, get_rank_world_size, get_world_size, is_ompi
+from .....mapping import Mapping
+from ....distributed import AllReduce, allgather
+from ....modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy
+from ...distributed.common import ReduceOp, get_rank_world_size, get_world_size, is_ompi
 
 # Cache AllReduce modules to avoid recreating on every call
 # This is critical for CUDA graph compatibility - recreating modules during
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py
index e69de29bb2..ea14aa9647 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py
index e69de29bb2..ea14aa9647 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/delta_rule/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py
index 5c52410f24..6026dfe4d5 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Cached attention op for delta rule using the fla kernel library.
 
 Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py
index 22d3445535..9cbe89a664 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops corresponding to fla's chunked delta rule.
 
 Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py
index e69de29bb2..ea14aa9647 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py
index 2d0ee66cfc..3ff67e4378 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/load_moe_align.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 AOT-compiled moe_align CUDA kernel.
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
index c0a55d6c23..91842847e8 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Triton-kernels-based MXFP4 MoE ops (GPT-OSS style) with routing, swizzling, and fused activation
 
 from typing import Callable, Tuple
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py
index df99815f97..dd963c2fc1 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/torch_moe.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Callable, List
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py
index d33b752532..713d7dba03 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/triton_moe.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """
 Triton implementation of the Fused MOE ops. Inspired by vLLM's triton MOE implementation.
 """
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py
index 7a7a53d960..57f3392a26 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py
@@ -15,7 +15,9 @@
 
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.quant import TRTLLM_NVFP4_SCALING_VECTOR_SIZE
+from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import (
+    TRTLLM_NVFP4_SCALING_VECTOR_SIZE,
+)
 from tensorrt_llm._torch.utils import ActivationType
 
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py
new file mode 100644
index 0000000000..b11ccc7ee6
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Linear operations.
+
+This module provides linear layer and MoE router implementations:
+- linear: Linear layer operations
+- torch_router: MoE router operations
+"""
+
+__all__ = [
+    "linear",
+    "torch_router",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py
similarity index 55%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py
index 214626ad24..4a7a04adef 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/linear.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops for linear layers."""
 
 from typing import Optional
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py
similarity index 68%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py
index 57613c5e77..2cfc4a508d 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_router.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/linear/torch_router.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.nn.functional as F
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py
index e69de29bb2..ea14aa9647 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py
index f3ec5314f1..15d46a329d 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/flashinfer_backend_mamba.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import List
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py
index 2d2aab3b2c..75b73088a9 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom op collection for cached causal conv1d in pure PyTorch.
 
 This mirrors the structure used by the cached Mamba/SSM ops:
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py
index dfd3cb07ce..7d9e04abc3 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_mamba.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom op collection for cached mamba2 ssm transform (linear attention) in pure PyTorch.
 
 This file contains two kinds of functionality:
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py
index a53f984012..28b15388fb 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_causal_conv.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom op collection for uncached causal conv (sliding window with 1d)."""
 
 from typing import Optional
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py
index 752520a74a..dbec15699e 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_mamba.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom op collection for uncached mamba mixer (linear attention)."""
 
 from typing import List, Tuple
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py
new file mode 100644
index 0000000000..b2c4737b67
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Multi-head Latent Attention operations.
+
+This module provides Multi-head Latent Attention (MLA) implementations:
+- mla: MLA operations and attention descriptor
+"""
+
+__all__ = [
+    "mla",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py
similarity index 91%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py
index e8673ae20e..f435fc5818 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/mla.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/mla/mla.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops for MultiHead Latent attention."""
 
 import math
@@ -7,8 +22,9 @@ import torch
 from torch._ops import OpOverloadPacket
 from torch.fx import Node
 
-from ....llmapi.llm_args import KvCacheConfig
-from .attention_interface import (
+from .....llmapi.llm_args import KvCacheConfig
+from ..attention.triton_attention import _decode_attention, _prefill_attention
+from ..attention_interface import (
     AttentionDescriptor,
     AttentionLayout,
     AttentionRegistry,
@@ -16,7 +32,6 @@ from .attention_interface import (
     ResourceHandlerDict,
     UnpagedResourceHandler,
 )
-from .triton_attention import _decode_attention, _prefill_attention
 
 Constant = Union[int, float, str, None]
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py
new file mode 100644
index 0000000000..f9a763fc2f
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/__init__.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Normalization operations.
+
+This module provides various normalization implementations:
+- rms_norm: RMSNorm implementations (FlashInfer, Triton, reference)
+- triton_rms_norm: Low-level Triton RMSNorm kernel
+- l2norm: L2 normalization operations
+- flashinfer_fused_add_rms_norm: Fused add + RMSNorm operation
+"""
+
+__all__ = [
+    "rms_norm",
+    "triton_rms_norm",
+    "l2norm",
+    "flashinfer_fused_add_rms_norm",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py
similarity index 97%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py
index d7a183ce90..61b73a77c5 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_fused_add_rms_norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/flashinfer_fused_add_rms_norm.py
@@ -12,7 +12,7 @@
 import flashinfer
 import torch
 
-from ...flashinfer_utils import get_env_enable_pdl
+from ....flashinfer_utils import get_env_enable_pdl
 
 
 @torch.library.custom_op(
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py
similarity index 60%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py
index 28380c346d..6e030a1362 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/l2norm.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom ops corresponding to l2norm."""
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py
similarity index 92%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py
index 2f85d87518..45a7080d5a 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/rms_norm.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Custom operator for FlashInfer and Triton RMSNorm implementation."""
 
 import flashinfer
@@ -6,9 +21,9 @@ import torch.distributed as dist
 import torch.nn.functional as F
 from einops import rearrange
 
-from ...flashinfer_utils import get_env_enable_pdl
-from ...modules.mamba.layernorm_gated import _layer_norm_fwd
-from .triton_kernels.rms_norm import rms_norm
+from ....flashinfer_utils import get_env_enable_pdl
+from ....modules.mamba.layernorm_gated import _layer_norm_fwd
+from .triton_rms_norm import rms_norm
 
 
 @torch.library.custom_op("auto_deploy::flashinfer_rms_norm", mutates_args=())
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py
similarity index 67%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py
index 70c67ac80a..6c1d3774f3 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rms_norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/normalization/triton_rms_norm.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import triton
 import triton.language as tl
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py
new file mode 100644
index 0000000000..e968c77643
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Quantization operations.
+
+This module provides quantization utilities and operations:
+- quant: Quantization operations (FP8, FP4, INT4, INT8)
+- torch_quant: PyTorch-based quantization implementations
+"""
+
+__all__ = [
+    "quant",
+    "torch_quant",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py
similarity index 94%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py
index cfb4049923..ff5e1133c0 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/quant.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Definition of the quant module that can be used for PTQ."""
 
 import warnings
@@ -7,7 +22,7 @@ import torch
 from flashinfer import bmm_fp8
 from torch import nn
 
-from .torch_libs.float8_python_api import addmm_float8_unwrapped
+from ..torch_libs.float8_python_api import addmm_float8_unwrapped
 
 TRTLLM_FP4_OP_AVAILABLE = True
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py
similarity index 95%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py
index 78a1fe5d83..4ad8dd7b8e 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quantization/torch_quant.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import List, Optional
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py
new file mode 100644
index 0000000000..3f1cf25763
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/__init__.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""RoPE (Rotary Position Embedding) operations.
+
+This package groups the following RoPE submodules:
+- torch_rope: PyTorch reference implementation
+- flashinfer_rope: FlashInfer-based optimized RoPE
+- triton_rope: Triton-based RoPE implementation
+- triton_rope_kernel: Low-level Triton kernels for RoPE
+"""
+
+__all__ = [
+    "torch_rope",
+    "flashinfer_rope",
+    "triton_rope",
+    "triton_rope_kernel",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py
similarity index 77%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py
index e4f329eeec..defe5fca5f 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_rope.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/flashinfer_rope.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Tuple
 
 import flashinfer
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py
similarity index 85%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py
index da769158b6..ce35e51b85 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_rope.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/torch_rope.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Tuple
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py
similarity index 83%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py
index b9282d3038..3c5d79c0f0 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_rope.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope.py
@@ -1,7 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import triton
 
-from .triton_kernels.rope import rope_fwd_flattened_kernel, rope_fwd_kernel
+from .triton_rope_kernel import rope_fwd_flattened_kernel, rope_fwd_kernel
 
 
 @torch.library.custom_op("auto_deploy::triton_rope_with_input_pos", mutates_args=())
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py
similarity index 89%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py
index dc3701edd2..4139d3a9d9 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/rope.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rope/triton_rope_kernel.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import triton
 import triton.language as tl
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py
index e69de29bb2..ea14aa9647 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_libs/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_kernels/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py
new file mode 100644
index 0000000000..bbd45886bc
--- /dev/null
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility operations.
+
+This package groups the following utility submodules:
+- torch_gather_logits: Logit gathering operations
+- triton_utils: Triton utility functions and helpers
+"""
+
+__all__ = [
+    "torch_gather_logits",
+    "triton_utils",
+]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py
similarity index 70%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py
index 7669ea8966..e86097d687 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/torch_gather_logits.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/torch_gather_logits.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py
similarity index 80%
rename from tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py
rename to tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py
index f65a570bc1..64c2b01aa3 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/triton_utils.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/utils/triton_utils.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Triton utility operations for auto_deploy."""
 
 import torch
diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py
index 259f997afd..b71a0da2f6 100644
--- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py
+++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py
@@ -31,7 +31,7 @@ from transformers.generation import GenerationMixin
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import ModelOutput
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import gated_rms_norm_ref
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import gated_rms_norm_ref
 from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory
 from tensorrt_llm._torch.utils import ActivationType
 
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py
index d0bfeee09b..103578ca01 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_add_rms_norm.py
@@ -16,7 +16,7 @@ from typing import Tuple
 import torch
 from torch.fx import GraphModule
 
-from ...custom_ops.flashinfer_fused_add_rms_norm import flashinfer_fused_add_rms_norm
+from ...custom_ops.normalization.flashinfer_fused_add_rms_norm import flashinfer_fused_add_rms_norm
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
 from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py
index f7dc25e0f0..94dc64892c 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fused_moe.py
@@ -9,7 +9,10 @@ from torch.fx import GraphModule, Node
 
 from tensorrt_llm._torch.utils import ActivationType
 
-from ...custom_ops.quant import TRTLLM_NVFP4_PACKING_FACTOR, TRTLLM_NVFP4_SCALING_VECTOR_SIZE
+from ...custom_ops.quantization.quant import (
+    TRTLLM_NVFP4_PACKING_FACTOR,
+    TRTLLM_NVFP4_SCALING_VECTOR_SIZE,
+)
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
 from ...utils._graph import delete_all_unused_submodules, eliminate_dead_code, get_attr_by_name
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py
index a9254b1294..d86c8244a5 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py
@@ -6,7 +6,7 @@ import torch
 import torch.nn as nn
 from torch.fx import GraphModule, Node
 
-from ...custom_ops.quant import (
+from ...custom_ops.quantization.quant import (
     FP4_GLOBAL_SCALE_MAX,
     FP8_MAX,
     TRTLLM_NVFP4_COLUMN_SIZE,
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py
index 5e06842989..9c5c5247f4 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py
@@ -6,7 +6,7 @@ import torch
 from pydantic import Field
 from torch.fx import GraphModule, Node
 
-from ...custom_ops.rms_norm import gated_rms_norm_ref
+from ...custom_ops.normalization.rms_norm import gated_rms_norm_ref
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
 
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
index f93b506201..9d6139dfec 100644
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
@@ -30,7 +30,7 @@ from pydantic import BaseModel, Field, field_validator
 from torch.fx import GraphModule, Node
 
 from .....functional import AllReduceStrategy
-from ...custom_ops.trtllm_dist import is_trtllm_op_available
+from ...custom_ops.distributed.trtllm_dist import is_trtllm_op_available
 from ...models.factory import ModelFactory, ShardingConfigSource
 from ...shim.interface import CachedSequenceInterface
 from ...utils._graph import del_attr_by_name, eliminate_dead_code
diff --git a/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py
index 889b06edb0..dd32386f50 100644
--- a/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py
+++ b/tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py
@@ -5,7 +5,7 @@ import torch
 import torch.nn.functional as F
 from torch.fx import GraphModule, Node
 
-from ..custom_ops.quant import FP4_GLOBAL_SCALE_MAX, FP8_MAX
+from ..custom_ops.quantization.quant import FP4_GLOBAL_SCALE_MAX, FP8_MAX
 from .logger import ad_logger
 from .node_utils import (
     extract_weight_name,
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py
index 8c15ce1820..7df5b1ce1b 100644
--- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py
+++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py
@@ -5,7 +5,9 @@ import torch
 from _dist_test_utils import get_device_counts
 from torch.export import export
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.trtllm_dist import is_trtllm_op_available
+from tensorrt_llm._torch.auto_deploy.custom_ops.distributed.trtllm_dist import (
+    is_trtllm_op_available,
+)
 from tensorrt_llm._torch.auto_deploy.distributed.common import initialize_or_skip
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py
index 6248174516..0bf32996a5 100644
--- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py
+++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_rmsnorm_sharding.py
@@ -22,7 +22,7 @@ import torch
 import torch.nn as nn
 
 # Ensure custom ops are registered
-from tensorrt_llm._torch.auto_deploy.custom_ops import rms_norm  # noqa: F401
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization import rms_norm  # noqa: F401
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_attention_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_attention_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_attention_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py
similarity index 99%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py
index aa2095f2f0..314d471e2d 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_flashinfer_attention_op.py
@@ -3,7 +3,9 @@ import pytest
 import torch
 from torch_attention_reference import TorchAttentionReference
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_attention import _GlobalFlashInferPlanner
+from tensorrt_llm._torch.auto_deploy.custom_ops.attention.flashinfer_attention import (
+    _GlobalFlashInferPlanner,
+)
 
 
 def _create_combined_kv_cache(k_cache: torch.Tensor, v_cache: torch.Tensor) -> torch.Tensor:
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_torch_attention_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_attention_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_torch_attention_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py
similarity index 99%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py
index d20ec16c7d..371637c5aa 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_attention_with_kv_cache.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/attention/test_triton_attention_with_kv_cache.py
@@ -7,7 +7,7 @@ import triton
 from _custom_op_utils import torch_rope_reference
 from _model_test_utils import repeat_kv
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.triton_kernels.attention_with_kv_cache import (
+from tensorrt_llm._torch.auto_deploy.custom_ops.attention.triton_attention_with_kv_cache import (
     attention_kv_stage1,
     attention_kv_stage2,
     context_attention_kv,
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_cuda_causal_conv_cached_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_cuda_causal_conv_cached_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_cuda_causal_conv_cached_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py
similarity index 98%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py
index 1ebe3a948a..f9e92638d3 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_mamba_cached_op.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py
@@ -2,7 +2,7 @@ import pytest
 import torch
 
 import tensorrt_llm._torch.auto_deploy  # noqa: F401
-from tests.unittest._torch.auto_deploy.unit.singlegpu.custom_ops.test_triton_mamba_cached_op import (
+from tests.unittest._torch.auto_deploy.unit.singlegpu.custom_ops.mamba.test_triton_mamba_cached_op import (
     _random_params,
 )
 
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_causal_conv_cached_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_causal_conv_cached_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_causal_conv_cached_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_mamba_cached_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_torch_mamba_cached_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_torch_mamba_cached_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_triton_mamba_cached_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_triton_mamba_cached_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/mamba/test_triton_mamba_cached_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_ad_moe_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_ad_moe_op.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_ad_moe_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_ad_moe_op.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_moe.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_triton_moe.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_moe.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_triton_moe.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py
similarity index 99%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py
index c41bf1a601..d85166d9c9 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/moe/test_trtllm_moe.py
@@ -13,7 +13,7 @@ from torch.nn import functional as F
 from utils.util import skip_pre_hopper
 
 import tensorrt_llm._torch.auto_deploy.custom_ops  # noqa: F401
-from tensorrt_llm._torch.auto_deploy.custom_ops.quant import (
+from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import (
     TRTLLM_NVFP4_COLUMN_SIZE,
     TRTLLM_NVFP4_ROW_SIZE,
     TRTLLM_NVFP4_SCALING_VECTOR_SIZE,
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py
similarity index 93%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py
index f6d67afb04..d6bc714eed 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_fused_add_rms_norm_op.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_flashinfer_fused_add_rms_norm_op.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import (
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.flashinfer_fused_add_rms_norm import (
     flashinfer_fused_add_rms_norm,
 )
 
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py
similarity index 93%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py
index 59952a6c89..0917090f64 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_mamba_rms_norm.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import (
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import (
     gated_rms_norm_ref,
     triton_rmsnorm_gated,
 )
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py
similarity index 76%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py
index 78b45cfd4a..4720f0876a 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_rms_norm.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/normalization/test_triton_rms_norm.py
@@ -1,7 +1,7 @@
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import *  # noqa
-from tensorrt_llm._torch.auto_deploy.custom_ops.triton_kernels.rms_norm import rms_norm
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import *  # noqa
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.triton_rms_norm import rms_norm
 
 
 def test_rmsnorm_triton_op():
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/quantization/test_quant.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/quantization/test_quant.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_rope_op_variants.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_rope_op_variants.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_rope_op_variants.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_rope_op_variants.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_rope.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_triton_rope.py
similarity index 100%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_rope.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/rope/test_triton_rope.py
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py
index 5c3bc1714c..34baf42d43 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_update_kv_cache.py
@@ -1,6 +1,6 @@
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.torch_attention import update_kv_cache
+from tensorrt_llm._torch.auto_deploy.custom_ops.attention.torch_attention import update_kv_cache
 
 
 def test_update_kv_cache():
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py
similarity index 98%
rename from tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py
rename to tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py
index de684fb6f8..4d68ec9d54 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/triton_kernels/test_triton_utils.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/utils/test_triton_utils.py
@@ -4,7 +4,7 @@ import pytest
 import torch
 
 # Import to register the custom op
-from tensorrt_llm._torch.auto_deploy.custom_ops import triton_utils  # noqa: F401
+from tensorrt_llm._torch.auto_deploy.custom_ops.utils import triton_utils  # noqa: F401
 
 
 def _reference_gather_scatter(
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py
index e52b8e4392..ce8c0d50d6 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_l2norm.py
@@ -3,7 +3,7 @@ import torch
 from _graph_test_helpers import run_test_transformed_gm
 from torch.export import Dim
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.l2norm import *  # noqa
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.l2norm import *  # noqa
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py
index c62e5d5396..3d83eb9151 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py
@@ -3,7 +3,7 @@ import torch
 from _graph_test_helpers import run_test_transformed_gm
 from torch.export import Dim
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import *  # noqa
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import *  # noqa
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py
index 561fe34338..1fd19e6cdc 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rope_attention.py
@@ -23,8 +23,8 @@ import torch
 from torch.export import Dim
 
 # Import modules to register custom ops (torch.ops.auto_deploy.*)
-import tensorrt_llm._torch.auto_deploy.custom_ops.torch_attention  # noqa: F401
-import tensorrt_llm._torch.auto_deploy.custom_ops.torch_rope  # noqa: F401
+import tensorrt_llm._torch.auto_deploy.custom_ops.attention.torch_attention  # noqa: F401
+import tensorrt_llm._torch.auto_deploy.custom_ops.rope.torch_rope  # noqa: F401
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py
index 6926e980c9..dc353e4993 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fused_add_rms_norm.py
@@ -1,8 +1,8 @@
 import torch
 from torch.export import Dim
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_fused_add_rms_norm import *  # noqa
-from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import *  # noqa
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.flashinfer_fused_add_rms_norm import *  # noqa
+from tensorrt_llm._torch.auto_deploy.custom_ops.normalization.rms_norm import *  # noqa
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py
index 4fb9cc1359..a3a9ad80ac 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/utils/test_quantization_utils.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from tensorrt_llm._torch.auto_deploy.custom_ops.quant import FP8_MAX
+from tensorrt_llm._torch.auto_deploy.custom_ops.quantization.quant import FP8_MAX
 from tensorrt_llm._torch.auto_deploy.transform.interface import TransformConfig
 from tensorrt_llm._torch.auto_deploy.transform.library.quantization import (
     FP8LinearQuantizationFromConfig,