From 24ac86c48549fb42fdcc29bf6a0de883dd8d03f0 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 28 Jan 2026 19:56:32 -0800 Subject: [PATCH] [https://nvbugs/5761391][fix] Include triton-kernels as a packaged dependency (#10471) Signed-off-by: Anish Shanbhag --- .github/CODEOWNERS | 6 + .pre-commit-config.yaml | 3 + ATTRIBUTIONS-Python.md | 36 +- examples/models/core/gpt_oss/README.md | 27 +- pyproject.toml | 8 +- requirements.txt | 2 +- scripts/vendor_triton_kernels.py | 200 ++++++ setup.py | 6 + tensorrt_llm/__init__.py | 37 ++ .../custom_ops/fused_moe/mxfp4_moe.py | 37 +- .../transform/library/mxfp4_moe.py | 7 +- .../modules/fused_moe/fused_moe_triton.py | 43 +- .../_torch/modules/fused_moe/quantization.py | 2 + tensorrt_llm/_torch/modules/triton_linear.py | 14 +- .../defs/accuracy/references/gsm8k.yaml | 4 - .../defs/accuracy/test_llm_api_pytorch.py | 126 ++-- .../multigpu/custom_ops/test_mxfp4_moe_ep.py | 14 +- .../_torch/modeling/test_modeling_gpt_oss.py | 10 +- .../unittest/_torch/modules/test_fused_moe.py | 68 +- .../_torch/modules/test_triton_linear.py | 28 +- .../others/test_triton_kernels_vendoring.py | 73 +++ tests/unittest/utils/util.py | 3 + triton_kernels/LICENSE | 23 + triton_kernels/README.md | 24 + triton_kernels/VERSION | 3 + triton_kernels/__init__.py | 3 + triton_kernels/compaction.py | 73 +++ triton_kernels/compaction_details/__init__.py | 1 + .../compaction_details/_masked_compaction.py | 24 + triton_kernels/matmul_ogs.py | 613 ++++++++++++++++++ triton_kernels/matmul_ogs_details/__init__.py | 1 + triton_kernels/matmul_ogs_details/_common.py | 169 +++++ .../matmul_ogs_details/_matmul_ogs.py | 433 +++++++++++++ .../matmul_ogs_details/_p_matmul_ogs.py | 475 ++++++++++++++ .../matmul_ogs_details/_reduce_grouped.py | 98 +++ .../matmul_ogs_details/opt_flags.py | 307 +++++++++ .../opt_flags_details/__init__.py | 1 + .../opt_flags_details/opt_flags_amd.py | 37 ++ .../opt_flags_details/opt_flags_nvidia.py | 115 ++++ triton_kernels/numerics.py | 46 ++ triton_kernels/numerics_details/__init__.py | 3 + triton_kernels/numerics_details/flexpoint.py | 194 ++++++ triton_kernels/numerics_details/mxfp.py | 307 +++++++++ .../numerics_details/mxfp_details/__init__.py | 1 + .../mxfp_details/_downcast_to_mxfp.py | 162 +++++ .../mxfp_details/_upcast_from_mxfp.py | 129 ++++ triton_kernels/proton_opts.py | 21 + triton_kernels/reduction_details/__init__.py | 1 + .../reduction_details/reduce_bitmatrix.py | 115 ++++ triton_kernels/routing.py | 396 +++++++++++ triton_kernels/routing_details/__init__.py | 1 + triton_kernels/routing_details/_expt_data.py | 68 ++ .../routing_details/_routing_compute.py | 152 +++++ triton_kernels/specialize.py | 139 ++++ triton_kernels/swiglu.py | 104 +++ triton_kernels/swiglu_details/__init__.py | 1 + triton_kernels/swiglu_details/_swiglu.py | 106 +++ triton_kernels/target_info.py | 58 ++ triton_kernels/tensor.py | 219 +++++++ triton_kernels/tensor_details/__init__.py | 1 + triton_kernels/tensor_details/layout.py | 44 ++ .../tensor_details/layout_details/__init__.py | 1 + .../tensor_details/layout_details/base.py | 23 + .../layout_details/blackwell_scale.py | 62 ++ .../layout_details/blackwell_value.py | 37 ++ .../layout_details/cdna4_scale.py | 48 ++ .../layout_details/hopper_scale.py | 84 +++ .../layout_details/hopper_value.py | 327 ++++++++++ .../tensor_details/layout_details/strided.py | 21 + triton_kernels/testing.py | 199 ++++++ triton_kernels/topk.py | 128 ++++ triton_kernels/topk_details/__init__.py | 3 + triton_kernels/topk_details/_topk_backward.py | 55 ++ triton_kernels/topk_details/_topk_forward.py | 150 +++++ 74 files changed, 6312 insertions(+), 248 deletions(-) create mode 100755 scripts/vendor_triton_kernels.py create mode 100644 tests/unittest/others/test_triton_kernels_vendoring.py create mode 100644 triton_kernels/LICENSE create mode 100644 triton_kernels/README.md create mode 100644 triton_kernels/VERSION create mode 100644 triton_kernels/__init__.py create mode 100644 triton_kernels/compaction.py create mode 100644 triton_kernels/compaction_details/__init__.py create mode 100644 triton_kernels/compaction_details/_masked_compaction.py create mode 100644 triton_kernels/matmul_ogs.py create mode 100644 triton_kernels/matmul_ogs_details/__init__.py create mode 100644 triton_kernels/matmul_ogs_details/_common.py create mode 100644 triton_kernels/matmul_ogs_details/_matmul_ogs.py create mode 100644 triton_kernels/matmul_ogs_details/_p_matmul_ogs.py create mode 100644 triton_kernels/matmul_ogs_details/_reduce_grouped.py create mode 100644 triton_kernels/matmul_ogs_details/opt_flags.py create mode 100644 triton_kernels/matmul_ogs_details/opt_flags_details/__init__.py create mode 100644 triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_amd.py create mode 100644 triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py create mode 100644 triton_kernels/numerics.py create mode 100644 triton_kernels/numerics_details/__init__.py create mode 100644 triton_kernels/numerics_details/flexpoint.py create mode 100644 triton_kernels/numerics_details/mxfp.py create mode 100644 triton_kernels/numerics_details/mxfp_details/__init__.py create mode 100644 triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py create mode 100644 triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py create mode 100644 triton_kernels/proton_opts.py create mode 100644 triton_kernels/reduction_details/__init__.py create mode 100644 triton_kernels/reduction_details/reduce_bitmatrix.py create mode 100644 triton_kernels/routing.py create mode 100644 triton_kernels/routing_details/__init__.py create mode 100644 triton_kernels/routing_details/_expt_data.py create mode 100644 triton_kernels/routing_details/_routing_compute.py create mode 100644 triton_kernels/specialize.py create mode 100644 triton_kernels/swiglu.py create mode 100644 triton_kernels/swiglu_details/__init__.py create mode 100644 triton_kernels/swiglu_details/_swiglu.py create mode 100644 triton_kernels/target_info.py create mode 100644 triton_kernels/tensor.py create mode 100644 triton_kernels/tensor_details/__init__.py create mode 100644 triton_kernels/tensor_details/layout.py create mode 100644 triton_kernels/tensor_details/layout_details/__init__.py create mode 100644 triton_kernels/tensor_details/layout_details/base.py create mode 100644 triton_kernels/tensor_details/layout_details/blackwell_scale.py create mode 100644 triton_kernels/tensor_details/layout_details/blackwell_value.py create mode 100644 triton_kernels/tensor_details/layout_details/cdna4_scale.py create mode 100644 triton_kernels/tensor_details/layout_details/hopper_scale.py create mode 100644 triton_kernels/tensor_details/layout_details/hopper_value.py create mode 100644 triton_kernels/tensor_details/layout_details/strided.py create mode 100644 triton_kernels/testing.py create mode 100644 triton_kernels/topk.py create mode 100644 triton_kernels/topk_details/__init__.py create mode 100644 triton_kernels/topk_details/_topk_backward.py create mode 100644 triton_kernels/topk_details/_topk_forward.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2c9d30a6c8..df16355955 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -217,6 +217,12 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers ## Any changes to versions, additions, or removals of third-party libraries /3rdparty/** @NVIDIA/trt-llm-oss-compliance +### Vendored Third-Party Code (triton-kernels) +## This is a temporary vendored copy of triton-kernels from the Triton project (MIT License). +## Do not accept contributions to this directory - it should only be updated via scripts/vendor_triton_kernels.py +## This can be removed if and when triton-kernels is published as a separate wheel. +/triton_kernels/** @NVIDIA/trt-llm-oss-compliance + ### Docker & Installation Scripts ## These scripts install and pin dependency versions /docker/common/** @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d62d7533ec..4e075798ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1366,6 +1366,9 @@ common-files: &common_files | triton_backend/tools/whisper/client.py | )$ +# Global exclude pattern for vendored third-party code +exclude: '^triton_kernels/' + default_install_hook_types: [pre-commit, commit-msg] repos: - repo: https://github.com/pycqa/isort diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index 614efb21de..8df48d3642 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -62379,7 +62379,7 @@ Copyright 2018- The Hugging Face team. All rights reserved. - `Homepage`: https://github.com/huggingface/transformers -## triton (3.5.0) +## triton (3.5.1) ### Licenses License: `MIT License` @@ -62417,6 +62417,40 @@ License: `MIT License` - `Homepage`: https://github.com/triton-lang/triton/ +## triton-kernels (3.5.1) + +### Licenses +License: `MIT License` + + - `LICENSE` (from triton repository root): +``` +Copyright 2018-2020 Philippe Tillet +Copyright 2020-2022 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files +(the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +### URLs + - `Source`: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels + + ## tritonclient (2.63.0) ### Licenses diff --git a/examples/models/core/gpt_oss/README.md b/examples/models/core/gpt_oss/README.md index 85cb21f6eb..22761c10e9 100644 --- a/examples/models/core/gpt_oss/README.md +++ b/examples/models/core/gpt_oss/README.md @@ -107,33 +107,10 @@ Once again, the function call works successfully, this time using a different fu ## Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels; enable them with the steps below: -1. **Build and install Triton** (tested with the commit below): +OpenAI ships a set of Triton kernels optimized for its MoE models. -```bash -git clone https://github.com/triton-lang/triton.git -cd triton -# Specific commit verified with TensorRT-LLM -git checkout f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f -pip install -r python/requirements.txt # build-time dependencies -pip install wheel build -python3 setup.py bdist_wheel -pip install ./dist/*.whl -``` - -2. **Expose the Triton kernels to TensorRT-LLM** - The kernels are not packaged in the wheel, so set the environment variable `TRITON_ROOT` to your Triton clone: - -```bash -export TRITON_ROOT=/local/user/triton -# TensorRT-LLM expects the kernels at: -# $TRITON_ROOT/python/triton_kernels -``` - -3. **Select Triton as the MoE backend** - -• **trtllm-serve** (or other similar commands) — add this snippet to the YAML file passed via `--config`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/pyproject.toml b/pyproject.toml index e50b3e1425..c20e658283 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,13 +10,15 @@ build-backend = "setuptools.build_meta" #################################################################################################### [tool.isort] line_length = 80 +known_first_party = ["tensorrt_llm"] +known_third_party = ["triton_kernels"] [tool.yapf] based_on_style = "pep8" column_limit = 80 [tool.codespell] -skip = ".git,3rdparty,tests/integration/test_input_files**,**.jsonl,**.json" +skip = ".git,3rdparty,triton_kernels,tests/integration/test_input_files**,**.jsonl,**.json" exclude-file = "examples/models/core/whisper/tokenizer.py" ignore-words-list = "rouge,inout,atleast,strat,nd,subtile,thrid,improbe,NotIn,te,iteract,anythin,tru,Tracin,vEw,dOut" @@ -42,6 +44,7 @@ fix = true # orders of magnitude faster, so we should move to deprecate `yapf`. exclude = [ "**3rdparty/**", + "triton_kernels/**", ".devcontainer/make_env.py", ".github/scripts/label_community_user.py", ".github/scripts/pr_checklist_check.py", @@ -1458,6 +1461,7 @@ convention = "google" [tool.ruff.lint.isort] known-first-party = ["tensorrt_llm"] +known-third-party = ["triton_kernels"] split-on-trailing-comma = false @@ -1490,7 +1494,7 @@ disallow_untyped_calls = false disallow_incomplete_defs = false disallow_untyped_defs = false warn_return_any = false -exclude = [] +exclude = ["triton_kernels"] [[tool.mypy.overrides]] diff --git a/requirements.txt b/requirements.txt index ffe5b2d2de..a7c27c2978 100644 --- a/requirements.txt +++ b/requirements.txt @@ -66,7 +66,7 @@ ninja etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a blake3 soundfile -triton==3.5.1 +triton==3.5.1 # NOTE: if you update this, you must also run scripts/vendor_triton_kernels.py to vendor the new version of triton_kernels tiktoken blobfile openai-harmony==0.0.4 diff --git a/scripts/vendor_triton_kernels.py b/scripts/vendor_triton_kernels.py new file mode 100755 index 0000000000..e50096ed4d --- /dev/null +++ b/scripts/vendor_triton_kernels.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +"""Script to vendor triton-kernels into TensorRT-LLM. + +This script: +1. Clones the Triton repo at a specific tag to a temp directory +2. Copies the triton_kernels module to the repo root as a top-level package +3. Adds attribution headers to all Python files +4. Copies the LICENSE file from Triton +5. Creates a VERSION file to track the vendored version +6. Creates a README.md with clear copyright attribution + +To update to a new version: + python scripts/vendor_triton_kernels.py --tag v3.6.0 +""" + +import argparse +import shutil +import subprocess +import tempfile +from pathlib import Path + +REPO_ROOT = Path(__file__).parent.parent.resolve() +TRITON_REPO_URL = "https://github.com/triton-lang/triton.git" +TRITON_KERNELS_MODULE_PATH = "python/triton_kernels/triton_kernels" +DEST_PATH = REPO_ROOT / "triton_kernels" + +VENDORED_NOTICE = "# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY." +ATTRIBUTION_HEADER = f"""\ +{VENDORED_NOTICE} +# Source: https://github.com/triton-lang/triton/tree/{{tag}}/{{original_file}} +# Triton is licensed under the MIT License. +""" + + +def clone_triton(tag: str, dest_dir: str) -> tuple[Path, Path]: + """Clone the Triton repo at the specified tag. Returns (module_path, repo_root).""" + print(f"Cloning Triton repo at tag {tag}...") + + subprocess.run( + ["git", "clone", "--depth", "1", "--branch", tag, TRITON_REPO_URL, dest_dir], + check=True, + capture_output=True, + text=True, + ) + + repo_root = Path(dest_dir) + triton_kernels_module_path = repo_root / TRITON_KERNELS_MODULE_PATH + if not triton_kernels_module_path.exists(): + raise RuntimeError(f"triton_kernels module not found at {triton_kernels_module_path}") + + return triton_kernels_module_path, repo_root + + +def add_attribution_header(file_path: Path, tag: str, original_rel_path: str) -> None: + content = file_path.read_text() + + # Handle shebang and encoding declarations + lines = content.split("\n") + insert_pos = 0 + preserved_lines = [] + + for i, line in enumerate(lines): + if line.startswith("#!") or line.startswith("# -*-") or line.startswith("# coding"): + preserved_lines.append(line) + insert_pos = i + 1 + else: + break + + header = ATTRIBUTION_HEADER.format(tag=tag, original_file=original_rel_path) + + new_content = "\n".join(preserved_lines) + if preserved_lines: + new_content += "\n" + new_content += header + + # Add blank line between header and content if file has content + remaining_content = "\n".join(lines[insert_pos:]) + if remaining_content.strip(): + new_content += "\n" + new_content += remaining_content + + file_path.write_text(new_content) + + +def copy_triton_kernels(src_path: Path, dest_path: Path, tag: str) -> list[str]: + """Copy triton_kernels module to destination and add attribution headers.""" + print(f"Copying triton_kernels to {dest_path}...") + + if dest_path.exists(): + print(f" Removing existing {dest_path}") + shutil.rmtree(dest_path) + + shutil.copytree(src_path, dest_path) + + # Add attribution headers to all existing Python files + python_files = [] + for py_file in dest_path.rglob("*.py"): + rel_path = py_file.relative_to(dest_path) + original_rel_path = f"{TRITON_KERNELS_MODULE_PATH}/{rel_path}" + python_files.append(str(rel_path)) + add_attribution_header(py_file, tag, original_rel_path) + + # Create __init__.py files in subdirs that don't have them. + # Triton's upstream code relies on implicit namespace packages (PEP 420), but + # setuptools' find_packages() requires __init__.py to discover subpackages. + for subdir in dest_path.rglob("*"): + if subdir.is_dir(): + init_file = subdir / "__init__.py" + if not init_file.exists(): + print(f" Creating {init_file.relative_to(dest_path)}") + init_file.write_text(f"{VENDORED_NOTICE}\n") + + print(f" Copied triton_kernels module to {dest_path}") + return python_files + + +def copy_license(triton_repo_root: Path, dest_path: Path) -> None: + """Copy the Triton LICENSE file.""" + print("Copying LICENSE file...") + + license_src = triton_repo_root / "LICENSE" + license_dest = dest_path / "LICENSE" + + shutil.copy2(license_src, license_dest) + print(f" Copied LICENSE to {license_dest}") + + +def create_version_file(dest_path: Path, tag: str) -> None: + """Create a VERSION file to track which version was vendored.""" + print("Creating VERSION file...") + + version_file = dest_path / "VERSION" + version_content = f"""{tag} +# This file tracks the version of triton-kernels that was vendored. +# To update, run: python scripts/vendor_triton_kernels.py --tag +""" + + version_file.write_text(version_content) + print(f" Created {version_file}") + + +def create_readme(dest_path: Path, tag: str) -> None: + """Create a README.md with clear copyright attribution.""" + print("Creating README.md...") + + readme_file = dest_path / "README.md" + readme_content = f"""# Vendored triton_kernels + +This directory contains code vendored from the [Triton](https://github.com/triton-lang/triton) project. + +| | | +|---|---| +| **Copyright** | The Triton Authors | +| **License** | MIT (see [LICENSE](LICENSE) file in this directory) | +| **Source** | https://github.com/triton-lang/triton/tree/{tag}/python/triton_kernels/triton_kernels | +| **Version** | `{tag}` | + +## Attribution + +This code is the work of the Triton authors and is included here under the MIT License. +Each Python file includes an attribution header indicating its origin. + +## Do Not Edit + +This code is vendored verbatim and should not be modified directly. +To update to a newer version, run: + +```bash +python scripts/vendor_triton_kernels.py --tag +``` +""" + + readme_file.write_text(readme_content) + print(f" Created {readme_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Vendor triton-kernels into TensorRT-LLM") + parser.add_argument( + "--tag", + required=True, + help="Triton git tag to vendor from. See the list of tags at https://github.com/triton-lang/triton/tags", + ) + args = parser.parse_args() + + print(f"Vendoring triton-kernels from Triton {args.tag}") + print(f"Destination: {DEST_PATH}") + + with tempfile.TemporaryDirectory() as tmp_dir: + triton_kernels_src, triton_repo_root = clone_triton(args.tag, tmp_dir) + copy_triton_kernels(triton_kernels_src, DEST_PATH, args.tag) + copy_license(triton_repo_root, DEST_PATH) + create_version_file(DEST_PATH, args.tag) + create_readme(DEST_PATH, args.tag) + + print("SUCCESS: triton-kernels has been vendored.") + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 83aec22469..795d668999 100644 --- a/setup.py +++ b/setup.py @@ -365,6 +365,11 @@ else: # Ensure rawref is included package_data.append('runtime/kv_cache_manager_v2/rawref/*.so') +# Add vendored triton_kernels as an explicit top-level package. +# This is vendored from the Triton project and kept at repo root so its +# internal absolute imports (e.g., "from triton_kernels.foo import bar") work. +packages += find_packages(include=["triton_kernels", "triton_kernels.*"]) + # https://setuptools.pypa.io/en/latest/references/keywords.html setup( name='tensorrt_llm', @@ -392,6 +397,7 @@ setup( keywords="nvidia tensorrt deeplearning inference", package_data={ 'tensorrt_llm': package_data, + 'triton_kernels': ['LICENSE', 'VERSION', 'README.md'], }, license_files=get_license(), entry_points={ diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index cea56431b7..7f4a25dd83 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -61,6 +61,43 @@ def _preload_python_lib(): _preload_python_lib() import sys +from pathlib import Path + + +def _setup_vendored_triton_kernels(): + """Ensure our vendored triton_kernels takes precedence over any existing installation. + + Some environments bundle triton_kernels, which can conflict with our vendored version. This function: + 1. Clears any pre-loaded triton_kernels from sys.modules + 2. Temporarily adds our package root to sys.path + 3. Imports triton_kernels (caching our version in sys.modules) + 4. Removes the package root from sys.path + """ + + # Clear any pre-loaded triton_kernels from cache + for mod in list(sys.modules.keys()): + if mod == "triton_kernels" or mod.startswith("triton_kernels."): + del sys.modules[mod] + + # Temporarily add our package root to sys.path + root = Path(__file__).parent.parent + + vendored = root / "triton_kernels" + if not vendored.exists(): + raise RuntimeError( + f"Vendored triton_kernels module not found at {vendored}") + + should_add_to_path = str(root) not in sys.path + if should_add_to_path: + sys.path.insert(0, str(root)) + + import triton_kernels # noqa: F401 + + if should_add_to_path: + sys.path.remove(str(root)) + + +_setup_vendored_triton_kernels() # Need to import torch before tensorrt_llm library, otherwise some shared binary files # cannot be found for the public PyTorch, raising errors like: diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py index d47d44378d..c0a55d6c23 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/mxfp4_moe.py @@ -4,36 +4,15 @@ from typing import Callable, Tuple import torch import torch.nn.functional as F +from triton_kernels.matmul_ogs import FlexCtx, FnSpecs, FusedActivation, PrecisionConfig, matmul_ogs +from triton_kernels.numerics import InFlexData +from triton_kernels.routing import RoutingData, routing +from triton_kernels.swiglu import swiglu_fn +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.tensor_details.layout import StridedLayout -IS_TRITON_KERNELS_AVAILABLE = True -TRITON_KERNELS_UNAVAILABLE_REASON = "" - -try: - from triton_kernels.matmul_ogs import ( - FlexCtx, - FnSpecs, - FusedActivation, - PrecisionConfig, - matmul_ogs, - ) - from triton_kernels.numerics import InFlexData - from triton_kernels.routing import RoutingData, routing - from triton_kernels.swiglu import swiglu_fn - from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor - from triton_kernels.tensor_details import layout - from triton_kernels.tensor_details.layout import StridedLayout - - from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter - -except Exception as _e: - IS_TRITON_KERNELS_AVAILABLE = False - TRITON_KERNELS_UNAVAILABLE_REASON = f"{type(_e).__name__}: {_e}" - - FlexCtx = FnSpecs = FusedActivation = PrecisionConfig = matmul_ogs = None - InFlexData = RoutingData = routing = swiglu_fn = None - FP4 = convert_layout = wrap_torch_tensor = None - layout = StridedLayout = None - TritonEPRouter = None +from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter # copied from transformers.integrations.mxfp4::swizzle_mxfp4 with minor modification diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py index b60a9f875c..4113732111 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/mxfp4_moe.py @@ -9,7 +9,6 @@ from tensorrt_llm._torch.auto_deploy.utils.pattern_matcher import ( register_ad_pattern, ) -from ...custom_ops.fused_moe.mxfp4_moe import IS_TRITON_KERNELS_AVAILABLE from ...utils.module import get_submodule_of_param from ...utils.node_utils import is_op from ..interface import BaseTransform, TransformInfo, TransformRegistry @@ -220,11 +219,7 @@ class InsertMXFP4MLP(BaseTransform): shared_config, ) -> Tuple[GraphModule, TransformInfo]: qcfg = factory.get_quant_config() - if ( - not qcfg - or qcfg.get("quant_method", "") != self.algo_name - or not IS_TRITON_KERNELS_AVAILABLE - ): + if not qcfg or qcfg.get("quant_method", "") != self.algo_name: return gm, TransformInfo( skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True ) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py index 22d95411bc..c256b1313c 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py @@ -1,32 +1,19 @@ from __future__ import annotations import os -import sys from typing import Dict, List, NamedTuple, Optional import torch import torch.nn as nn import triton import triton.language as tl - -IS_TRITON_KERNELS_AVAILABLE = False -# We expect to find triton_kernels under $TRITON_ROOT/python/triton_kernels -# Triton upstream commit f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f has been verified. -triton_root = os.getenv('TRITON_ROOT') -if triton_root: - triton_root = os.path.abspath( - os.path.join(triton_root, 'python', 'triton_kernels')) - if os.path.exists(triton_root) and triton_root not in sys.path: - sys.path.insert(0, triton_root) - assert triton.__version__ >= "3.4.0", "Triton kernels are detected but the Triton wheel is too old" - import triton_kernels.swiglu - from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation, - PrecisionConfig, matmul_ogs) - from triton_kernels.numerics import InFlexData - from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch - from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor - from triton_kernels.tensor_details import layout - IS_TRITON_KERNELS_AVAILABLE = True +import triton_kernels.swiglu +from triton_kernels.matmul_ogs import (FlexCtx, FnSpecs, FusedActivation, + PrecisionConfig, matmul_ogs) +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import downcast_to_mxfp_torch +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout from ...model_config import ModelConfig from ..linear import TensorParallelMode, load_weight_shard @@ -214,11 +201,16 @@ class TritonUnquantizedFusedMoEMethod(FusedMoEMethodBase): module.intermediate_size_per_partition, module.hidden_size, ) + # Bias shapes use the output dimension (last dim) of the transposed weight shapes + w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2]) + w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2]) super().create_weights(module, weight_dtype, w3_w1_weight_shape, w2_weight_shape, - bias_dtype=torch.float32) + bias_dtype=torch.float32, + w3_w1_bias_shape=w3_w1_bias_shape, + w2_bias_shape=w2_bias_shape) self.setup_quant_scales(module) def setup_quant_scales(self, module: torch.nn.Module): @@ -404,12 +396,17 @@ class TritonFP8QDQFusedMoEMethod(TritonUnquantizedFusedMoEMethod): module.intermediate_size_per_partition, module.hidden_size, ) + # Bias shapes use the output dimension (last dim) of the transposed weight shapes + w3_w1_bias_shape = (w3_w1_weight_shape[0], w3_w1_weight_shape[2]) + w2_bias_shape = (w2_weight_shape[0], w2_weight_shape[2]) FusedMoEMethodBase.create_weights(self, module, weight_dtype, w3_w1_weight_shape, w2_weight_shape, - bias_dtype=torch.float32) + bias_dtype=torch.float32, + w3_w1_bias_shape=w3_w1_bias_shape, + w2_bias_shape=w2_bias_shape) fc31_dequant = nn.Parameter(torch.empty( module.expert_size_per_partition, dtype=torch.float32), @@ -1295,8 +1292,6 @@ class TritonFusedMoE(MoE): weight_loading_mode=weight_loading_mode, layer_idx=layer_idx, ) - if not IS_TRITON_KERNELS_AVAILABLE: - raise ImportError("Triton kernels are not available.") if torch.cuda.get_device_capability()[0] != 9 and self.ep_size > 1: raise NotImplementedError( "TritonFusedMoE is only supported on Hopper with EP size > 1.") diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 40d26a9766..ac66e479af 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -623,6 +623,8 @@ def load_activation_scales_fp8_qdq(module: torch.nn.Module, weights: Dict): load_expert_fc2_input_scale_fp8_qdq(w2_input_scale, tmp_fc2_input_scale[expert_id]) + return tmp_fc31_input_scale.max(), tmp_fc2_input_scale.max() + def requantize_expert_w3_w1_weight_fp8_qdq(module: torch.nn.Module, w1_weight_scale, w3_weight_scale, diff --git a/tensorrt_llm/_torch/modules/triton_linear.py b/tensorrt_llm/_torch/modules/triton_linear.py index dfc3d584e6..f22f35bd0c 100644 --- a/tensorrt_llm/_torch/modules/triton_linear.py +++ b/tensorrt_llm/_torch/modules/triton_linear.py @@ -4,20 +4,15 @@ from typing import Dict, List, Optional import torch from torch.nn.parameter import Parameter +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, matmul_ogs +from triton_kernels.numerics import InFlexData from tensorrt_llm._torch.peft.lora.layer import LoraLayer from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig -# Reuse the common Triton import setup -from .fused_moe.fused_moe_triton import (IS_TRITON_KERNELS_AVAILABLE, - maybe_update_stride, +from .fused_moe.fused_moe_triton import (maybe_update_stride, swizzle_weight_and_scale) - -if IS_TRITON_KERNELS_AVAILABLE: - from triton_kernels.matmul_ogs import (FlexCtx, PrecisionConfig, matmul_ogs) - from triton_kernels.numerics import InFlexData - from .linear import (Linear, LinearMethodBase, TensorParallelMode, WeightsLoadingConfig, copy_weight, load_weight_shard, load_weights_fused_gate_up_helper, @@ -383,9 +378,6 @@ class TritonLinear(Linear): use_custom_cublas_mm: bool = False, lora: Optional[LoraLayer] = None, ): - if not IS_TRITON_KERNELS_AVAILABLE: - raise ImportError("Triton kernels are not available. " - "Please install the required dependencies.") assert not use_custom_cublas_mm, "TritonLinear does not support custom cublas mm." super().__init__( diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 7cc38a742c..4313ba15f3 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -253,10 +253,6 @@ microsoft/phi-4: accuracy: 90.64 mistralai/Codestral-22B-v0.1: - accuracy: 67.10 -GPT-OSS/BF16: - - accuracy: 90.3 - - kv_cache_quant_algo: FP8 - accuracy: 90.3 GPT-OSS/120B-MXFP4: - accuracy: 90.3 - spec_dec_algo: Eagle diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 6fb3f2f7dc..13f94c87cd 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -51,8 +51,6 @@ from defs.conftest import get_sm_version, is_sm_100f from tensorrt_llm import LLM from tensorrt_llm._torch.model_config import MoeLoadBalancerConfig -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig, DeepSeekSparseAttentionConfig, Eagle3DecodingConfig, KvCacheConfig, MoeConfig, @@ -3977,7 +3975,10 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) - @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"]) + @pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", + pytest.param("TRITON", marks=skip_no_hopper), "TRTLLM"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (1, 1, 1, False, True, True), @@ -4008,11 +4009,6 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("TRITON moe backend is not available.") - if get_sm_version() < 90: - pytest.skip("TRITON moe backend requires Hopper or newer.") if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100: pytest.skip( "CUTLASS or TRTLLM moe backend requires Blackwell or newer.") @@ -4467,11 +4463,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [ (True, True), ]) @@ -4480,8 +4477,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -4518,11 +4513,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (4, 1, 1, False, True, True), @@ -4551,10 +4547,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): patch_mpi_pool_session_for_env(mocker, {"ENABLE_CONFIGURABLE_MOE": env_value}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 @@ -4612,11 +4604,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): extra_evaluator_kwargs=extra_evaluator_kwargs) @pytest.mark.skip_less_device(8) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (8, 1, 1, False, True, True), @@ -4629,9 +4622,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, @@ -4653,6 +4643,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) + @skip_no_hopper @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) @@ -4667,14 +4658,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4") - pytorch_config = dict( - disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5, dtype=kv_cache_dtype) @@ -4683,11 +4668,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - **pytorch_config, + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, enable_attention_dp=attention_dp, moe_config=MoeConfig(backend="TRITON")) with llm: - model_name = "GPT-OSS/BF16" + model_name = "GPT-OSS/120B-MXFP4" task = GSM8K(model_name) task.evaluate(llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs) @@ -4696,11 +4682,12 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [ (2, 1, 1, False, True, True), @@ -4711,10 +4698,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) @@ -4783,16 +4766,13 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 @@ -4852,17 +4832,14 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ids=["overlap_scheduler", "no_overlap_scheduler"]) @pytest.mark.parametrize("one_model", [True, False], ids=["one_model", "two_model"]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if get_sm_version() == 90: pytest.skip( "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4" @@ -5044,17 +5021,14 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ids=["overlap_scheduler", "no_overlap_scheduler"]) @pytest.mark.parametrize("one_model", [True, False], ids=["one_model", "two_model"]) - @pytest.mark.parametrize( - "moe_backend", - ["CUTLASS", - pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"], - ids=["cutlass", "trtllm", "triton"]) + @pytest.mark.parametrize("moe_backend", [ + "CUTLASS", + pytest.param("TRTLLM", marks=skip_pre_blackwell), + pytest.param("TRITON", marks=skip_no_hopper) + ], + ids=["cutlass", "trtllm", "triton"]) def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, mocker): - if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py index a3e9b3254c..12ef18218a 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py @@ -4,11 +4,8 @@ import pytest import torch import torch.distributed as dist from _dist_test_utils import get_device_counts -from utils.util import getSMVersion +from utils.util import skip_no_hopper -from tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.mxfp4_moe import ( - IS_TRITON_KERNELS_AVAILABLE, -) from tensorrt_llm._torch.auto_deploy.distributed.common import spawn_multiprocess_job @@ -110,14 +107,7 @@ def _run_mxfp4_mlp_ep_dtype_test(num_experts: int, topk: int, rank: int, world_s torch.testing.assert_close(part_out, ref_out, rtol=5e-2, atol=5e-2, equal_nan=True) -@pytest.mark.skipif( - getSMVersion() != 90, - reason="triton_mxfp4_moe is only supported in Hopper architecture", -) -@pytest.mark.skipif( - not IS_TRITON_KERNELS_AVAILABLE, - reason="triton_kernels unavailable", -) +@skip_no_hopper @pytest.mark.parametrize("num_experts", [6, 8]) @pytest.mark.parametrize("topk", [4]) # must be <= num_experts @pytest.mark.parametrize("device_count", get_device_counts()) diff --git a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py index 0e2b974593..7d6bcc29e9 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py +++ b/tests/unittest/_torch/modeling/test_modeling_gpt_oss.py @@ -5,10 +5,9 @@ import shutil import pytest from transformers import AutoTokenizer from utils.llm_data import llm_models_root +from utils.util import skip_no_hopper from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig configs = """ @@ -48,11 +47,10 @@ def dump_config_json(dst_dir): json.dump(json_configs, f, indent=2, ensure_ascii=False) -@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"]) +@pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)]) def test_gpt_oss_trtllmgen(moe_backend): - if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - prompts = [ "How are you?", "Hello, my name is", diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 773218a40c..30e64a16f2 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -19,9 +19,8 @@ from mpi4py import MPI from mpi4py.futures import MPIPoolExecutor from transformers.configuration_utils import PretrainedConfig from utils.util import (check_accuracy, skip_blackwell, skip_blackwell_geforce, - skip_neither_ada_nor_hopper_unittest, - skip_non_hopper_unittest, skip_pre_blackwell, - skip_pre_hopper) + skip_neither_ada_nor_hopper_unittest, skip_no_hopper, + skip_pre_blackwell, skip_pre_hopper) from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.model_config import ModelConfig @@ -41,8 +40,6 @@ from tensorrt_llm._torch.modules.fused_moe import ( from tensorrt_llm._torch.modules.fused_moe.quantization import \ NVFP4CutlassFusedMoEMethod # isort: on -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm._torch.modules.gated_mlp import GatedMLP from tensorrt_llm._utils import get_sm_version, mpi_rank from tensorrt_llm.mapping import Mapping @@ -92,8 +89,8 @@ def test_fused_moe(moe_backend, mapping=None): if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") + if get_sm_version() != 90: + pytest.skip("TRITON moe backend is only supported on Hopper") if dtype != torch.bfloat16: pytest.skip("Unsupported for TritonFusedMoE") if routing_cls != RenormalizeMoeRoutingMethod: @@ -192,9 +189,9 @@ def test_fused_moe(moe_backend, # Evaluate outputs torch.cuda.synchronize() # There can be one off mismatch in the outputs due to different kernel implementations - # Here we check 99% of the outputs are within the tolerance + # Here we check most of the outputs are within the tolerance # The CutlassFusedMoE case fails as well without this change on H100 for bf16 - check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.984) + check_accuracy(output, ref_output, rtol=0.2, atol=0.2, percent=0.975) m //= 2 @@ -514,7 +511,9 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): @skip_pre_hopper -@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON"]) +@pytest.mark.parametrize( + "moe_backend", + ["CUTLASS", pytest.param("TRITON", marks=skip_no_hopper)]) @pytest.mark.parametrize("routing_cls", [DefaultMoeRoutingMethod, RenormalizeMoeRoutingMethod]) @pytest.mark.parametrize("bias", [True, False]) @@ -522,8 +521,6 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias): if moe_backend == "TRITON": - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") if dtype != torch.bfloat16: pytest.skip("Unsupported for TritonFusedMoE") if routing_cls != RenormalizeMoeRoutingMethod: @@ -632,19 +629,30 @@ def test_fused_moe_fp8(moe_backend, dtype, routing_cls, bias): with torch.inference_mode(), autotune(): fused_moe.forward(x, router_logits) - # Explicitly capture context for kernel testing - with AutoTuner.get().capture() as all_tactics, torch.inference_mode(): - output = fused_moe.forward(x, router_logits) - - # Test all kernel tactics - for tactic in all_tactics: - with AutoTuner.get().replay(tactic), torch.inference_mode(): + # TRITON backend uses Triton kernels which don't register with AutoTuner + if moe_backend == "TRITON": + with torch.inference_mode(): output = fused_moe.forward(x, router_logits) - check_accuracy(output, - ref_output, - rtol=0.04, - atol=0.1, - percent=0.99) + check_accuracy(output, + ref_output, + rtol=0.04, + atol=0.1, + percent=0.99) + else: + # Explicitly capture context for kernel testing + with AutoTuner.get().capture() as all_tactics, torch.inference_mode( + ): + output = fused_moe.forward(x, router_logits) + + # Test all kernel tactics + for tactic in all_tactics: + with AutoTuner.get().replay(tactic), torch.inference_mode(): + output = fused_moe.forward(x, router_logits) + check_accuracy(output, + ref_output, + rtol=0.04, + atol=0.1, + percent=0.99) def set_tensor_value_2(x, num_row, num_cols): @@ -1174,7 +1182,7 @@ def test_fused_moe_fp8_blockwise_cute_dsl(dtype, return True -@skip_non_hopper_unittest +@skip_no_hopper @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", product( @@ -1306,7 +1314,7 @@ def test_fused_moe_fp8_blockwise_cutlass(dtype, return True -@skip_non_hopper_unittest +@skip_no_hopper @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("ep_size", [1, 2, 4]) @@ -2526,7 +2534,7 @@ def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend, check_accuracy(output, ref_output, rtol=1e-2, atol=0.1, percent=0.99) -@skip_pre_hopper +@skip_no_hopper @pytest.mark.parametrize("experts", [8, 128]) @pytest.mark.parametrize( "hidden_size, intermediate_size", @@ -2542,12 +2550,8 @@ def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend, @pytest.mark.parametrize("dynamic_quant", [True, False]) def test_fused_moe_triton_mxfp4(experts, hidden_size, intermediate_size, fp8_activation, bias, dynamic_quant): - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if torch.cuda.get_device_capability()[0] < 10 and fp8_activation: + if fp8_activation: pytest.skip("Latest Triton requires BF16 activation on Hopper") - if torch.cuda.get_device_capability()[0] >= 10 and not fp8_activation: - pytest.skip("Latest Triton requires FP8 activation on Blackwell") mapping = Mapping() mapping.rank = mpi_rank() diff --git a/tests/unittest/_torch/modules/test_triton_linear.py b/tests/unittest/_torch/modules/test_triton_linear.py index 2d5e87ae68..9b1229cf5d 100644 --- a/tests/unittest/_torch/modules/test_triton_linear.py +++ b/tests/unittest/_torch/modules/test_triton_linear.py @@ -5,10 +5,8 @@ import cloudpickle import pytest import torch from mpi4py import MPI -from utils.util import check_accuracy, skip_pre_hopper +from utils.util import check_accuracy, skip_no_hopper -from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ - IS_TRITON_KERNELS_AVAILABLE from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.triton_linear import TritonLinear from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig @@ -21,11 +19,10 @@ MPI.pickle.__init__( ) -@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear]) +@pytest.mark.parametrize( + "linear_cls", + [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)]) def test_linear_unquantized(linear_cls): - if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear: - pytest.skip("Triton kernels are not available") - torch.manual_seed(0) torch.cuda.manual_seed(0) num_tokens = 128 @@ -56,11 +53,10 @@ def test_linear_unquantized(linear_cls): check_accuracy(actual_c, reference_c, atol=0.01, rtol=0.01, percent=0.99) -@pytest.mark.parametrize("linear_cls", [Linear, TritonLinear]) +@pytest.mark.parametrize( + "linear_cls", + [Linear, pytest.param(TritonLinear, marks=skip_no_hopper)]) def test_linear_fp8qdq(linear_cls): - if not IS_TRITON_KERNELS_AVAILABLE and linear_cls is TritonLinear: - pytest.skip("Triton kernels are not available") - torch.manual_seed(0) torch.cuda.manual_seed(0) num_tokens = 128 @@ -100,18 +96,12 @@ def test_linear_fp8qdq(linear_cls): percent=0.99) -@skip_pre_hopper +@skip_no_hopper @pytest.mark.parametrize("activation_dtype", [torch.bfloat16, torch.float8_e4m3fn]) def test_linear_mxfp4(activation_dtype): - if not IS_TRITON_KERNELS_AVAILABLE: - pytest.skip("Triton kernels are not available") - if torch.cuda.get_device_capability( - )[0] < 10 and activation_dtype == torch.float8_e4m3fn: + if activation_dtype == torch.float8_e4m3fn: pytest.skip("Latest Triton requires BF16 activation on Hopper") - if torch.cuda.get_device_capability( - )[0] >= 10 and activation_dtype == torch.bfloat16: - pytest.skip("Latest Triton requires FP8 activation on Blackwell") dtype = torch.bfloat16 num_tokens = 128 diff --git a/tests/unittest/others/test_triton_kernels_vendoring.py b/tests/unittest/others/test_triton_kernels_vendoring.py new file mode 100644 index 0000000000..8a88baeb2f --- /dev/null +++ b/tests/unittest/others/test_triton_kernels_vendoring.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for vendored triton_kernels package. + +The triton_kernels package is vendored from the Triton project to provide +optimized kernels. These tests verify that the vendoring mechanism works +correctly and that our version takes precedence over any external installation. +""" + +import unittest +from pathlib import Path + + +class TestTritonKernelsVendoring(unittest.TestCase): + def test_triton_kernels_is_vendored_version(self): + """Verify we're using the vendored version (has VERSION and LICENSE files).""" + import tensorrt_llm # noqa: F401, I001 + import triton_kernels + + triton_kernels_path = Path(triton_kernels.__file__).parent + + # VERSION file is added by our vendor script and not present in external installations + version_file = triton_kernels_path / "VERSION" + self.assertTrue( + version_file.exists(), + f"VERSION file not found at {version_file}. " + "This suggests an external triton_kernels is being used instead of our vendored version.", + ) + + # LICENSE file should also be present for compliance + license_file = triton_kernels_path / "LICENSE" + self.assertTrue(license_file.exists(), f"LICENSE file not found at {license_file}.") + + def test_version_matches_requirements(self): + """Verify vendored triton_kernels VERSION matches triton version in requirements.txt.""" + import re + + repo_root = Path(__file__).parent.parent.parent.parent + + version_file = repo_root / "triton_kernels" / "VERSION" + vendored_version = version_file.read_text().strip().split()[0].lstrip("v") + + requirements_file = repo_root / "requirements.txt" + requirements_text = requirements_file.read_text() + + match = re.search(r"^triton==([^\s#]+)", requirements_text, re.MULTILINE) + self.assertIsNotNone(match, "Could not find triton version in requirements.txt") + requirements_version = match.group(1) + + self.assertEqual( + vendored_version, + requirements_version, + f"Vendored triton_kernels version ({vendored_version}) does not match " + f"triton version in requirements.txt ({requirements_version}). " + "To update the vendored triton_kernels, run: python scripts/vendor_triton_kernels.py " + f"--tag v{requirements_version}", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index 2c731e9110..828bf08ea5 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -106,6 +106,9 @@ skip_pre_ada = pytest.mark.skipif( skip_pre_hopper = pytest.mark.skipif( getSMVersion() < 90, reason="This test is not supported in pre-Hopper architecture") +skip_no_hopper = pytest.mark.skipif( + getSMVersion() != 90, + reason="This test is only supported in Hopper architecture") skip_pre_blackwell = pytest.mark.skipif( getSMVersion() < 100, reason="This test is not supported in pre-Blackwell architecture") diff --git a/triton_kernels/LICENSE b/triton_kernels/LICENSE new file mode 100644 index 0000000000..1d0238e86b --- /dev/null +++ b/triton_kernels/LICENSE @@ -0,0 +1,23 @@ +/* +* Copyright 2018-2020 Philippe Tillet +* Copyright 2020-2022 OpenAI +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ diff --git a/triton_kernels/README.md b/triton_kernels/README.md new file mode 100644 index 0000000000..4d6005b9f4 --- /dev/null +++ b/triton_kernels/README.md @@ -0,0 +1,24 @@ +# Vendored triton_kernels + +This directory contains code vendored from the [Triton](https://github.com/triton-lang/triton) project. + +| | | +|---|---| +| **Copyright** | The Triton Authors | +| **License** | MIT (see [LICENSE](LICENSE) file in this directory) | +| **Source** | https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels | +| **Version** | `v3.5.1` | + +## Attribution + +This code is the work of the Triton authors and is included here under the MIT License. +Each Python file includes an attribution header indicating its origin. + +## Do Not Edit + +This code is vendored verbatim and should not be modified directly. +To update to a newer version, run: + +```bash +python scripts/vendor_triton_kernels.py --tag +``` diff --git a/triton_kernels/VERSION b/triton_kernels/VERSION new file mode 100644 index 0000000000..c63f79d80c --- /dev/null +++ b/triton_kernels/VERSION @@ -0,0 +1,3 @@ +v3.5.1 +# This file tracks the version of triton-kernels that was vendored. +# To update, run: python scripts/vendor_triton_kernels.py --tag diff --git a/triton_kernels/__init__.py b/triton_kernels/__init__.py new file mode 100644 index 0000000000..7676bb8f76 --- /dev/null +++ b/triton_kernels/__init__.py @@ -0,0 +1,3 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/__init__.py +# Triton is licensed under the MIT License. diff --git a/triton_kernels/compaction.py b/triton_kernels/compaction.py new file mode 100644 index 0000000000..7b0ed10c35 --- /dev/null +++ b/triton_kernels/compaction.py @@ -0,0 +1,73 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/compaction.py +# Triton is licensed under the MIT License. + +import torch +from .compaction_details._masked_compaction import _masked_compaction +from .tensor import Bitmatrix + + +def compaction(yv, yi, bitmask, sentinel=-1): + """ + Return compacted copies of *yv* and *yi* based on a per-row bitmask. + + Only the elements whose index appears among the active bits of *bitmask* + are kept; the rest are replaced by *sentinel*. Kept elements preserve + their original left-to-right order. + + Parameters + ---------- + yv : torch.Tensor, shape (B, K) + Values tensor. + yi : torch.Tensor, shape (B, K), dtype torch.long + Integer indices (0 ≤ index < 32) associated with *yv*. + bitmask : torch.Tensor, shape (B,) **or** (B, 32) + Per-row mask of active indices. See the in-place version for details. + sentinel : int, default -1 + Value written into dropped positions of the returned tensors. + + Returns + ------- + (yv_out, yi_out) : Tuple[torch.Tensor, torch.Tensor], each shape (B, K) + New tensors with the same dtype/device as the inputs. + + """ + + n_rows, n_cols = yi.shape + ret_yv = torch.empty_like(yv) + ret_yi = torch.empty_like(yi) + if isinstance(bitmask, Bitmatrix): + bitmask = bitmask.storage.data + + _masked_compaction[(n_rows, )]( + yv, yi, bitmask, bitmask.stride(0), bitmask.stride(1), # inputs + ret_yv, ret_yi, # outputs + sentinel, # sentinel + K=n_cols # constants + ) + return ret_yv, ret_yi + + +def compaction_torch(yv: torch.Tensor, yi: torch.Tensor, bitmask: torch.Tensor, sentinel=-1): + """ + reference implementation of `masked_compact` + """ + B, K = yi.shape + device = yi.device + # Expand bitmask to a boolean matrix of active bits (B, 32) + w = (1 << torch.arange(32, device=device, dtype=bitmask.dtype)) + bits = (bitmask.unsqueeze(-1) & w) != 0 + mask = bits.flatten(start_dim=-2) # or bits.reshape(B, -1) + # For every yi element decide whether it should be kept + keep = mask.gather(1, yi.long()) + # Build a stable permutation that brings all "keep" items forward + # False→0, True→1 ==> invert so kept==0, dropped==1, then argsort + order = (~keep).to(torch.int).argsort(dim=1, stable=True) + # Re‑order tensors according to above permutation + yi_sorted = yi.gather(1, order) + yv_sorted = yv.gather(1, order) + # fill relevant positions with sentinel + keep_sorted = keep.gather(1, order) + yi_sorted[~keep_sorted] = sentinel + yv_sorted[~keep_sorted] = sentinel + return yv_sorted, yi_sorted diff --git a/triton_kernels/compaction_details/__init__.py b/triton_kernels/compaction_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/compaction_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/compaction_details/_masked_compaction.py b/triton_kernels/compaction_details/_masked_compaction.py new file mode 100644 index 0000000000..1432644d44 --- /dev/null +++ b/triton_kernels/compaction_details/_masked_compaction.py @@ -0,0 +1,24 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/compaction_details/_masked_compaction.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + + +@triton.jit +def _masked_compaction(Yv, Yi, BitMask, stride_bm, stride_bn, RetYv, RetYi, sentinel, K: tl.constexpr): + pid_m = tl.program_id(0) + yv = tl.load(Yv + pid_m * K + tl.arange(0, K)) + yi = tl.load(Yi + pid_m * K + tl.arange(0, K)) + div = yi // 32 + rem = yi % 32 + active_bits = (tl.load(BitMask + pid_m * stride_bm + div * stride_bn) >> rem) & 1 + exc_cumsum = tl.cumsum(active_bits, 0) - active_bits + active_flags = active_bits.to(tl.int1) + rev_arange = tl.where(active_flags, 0, K - 1 - tl.arange(0, K)) + write_indx = exc_cumsum + rev_arange + yv = tl.where(active_flags, yv, sentinel) + yi = tl.where(active_flags, yi, sentinel) + tl.store(RetYv + pid_m * K + write_indx, yv) + tl.store(RetYi + pid_m * K + write_indx, yi) diff --git a/triton_kernels/matmul_ogs.py b/triton_kernels/matmul_ogs.py new file mode 100644 index 0000000000..b2cc83b2b5 --- /dev/null +++ b/triton_kernels/matmul_ogs.py @@ -0,0 +1,613 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs.py +# Triton is licensed under the MIT License. + +# isort: off +# fmt: off +from dataclasses import dataclass +import itertools +import sys +import torch +import triton +from enum import Enum, auto +import math +# utilities +from triton_kernels import target_info +from triton_kernels.numerics import InFlexData, OutFlexData +from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx +from triton_kernels.target_info import is_cuda +# details +from .matmul_ogs_details._matmul_ogs import _matmul_ogs +from .matmul_ogs_details._p_matmul_ogs import _p_matmul_ogs, get_per_device_per_stream_alloc_fn +from .matmul_ogs_details._reduce_grouped import _reduce_grouped +from .numerics_details.mxfp import MXFP_BLOCK_SIZE +from .matmul_ogs_details.opt_flags import make_opt_flags, update_opt_flags_constraints, InapplicableConstraint +from .specialize import specialize +from .tensor import Storage, Tensor, FP4, bitwidth, wrap_torch_tensor + + +@dataclass(frozen=True) +class FnSpecs: + name: str + fn: "triton.runtime.jit.JITFunction" + fn_arg_names: tuple[str] + fn_arg_do_not_specialize: tuple[str] = tuple() + + @staticmethod + def default(): + return FnSpecs("dflt", None, tuple()) + + +@dataclass(frozen=True) +class FusedActivation: + specs: FnSpecs = FnSpecs.default() + fn_args: tuple[object] = tuple() + reduction_n: int = 1 + + +@dataclass(frozen=True) +class Epilogue: + specs: FnSpecs = FnSpecs.default() + fn_arg_values_matmul: tuple[object] = tuple() + fn_arg_values_finalize: tuple[object] = tuple() + effective_itemsize: float = None + +class FnName(Enum): + QUANTIZE_MXFP8 = auto() + + +EpilogueSpecs = FnSpecs # TODO: remove this alias when callers are updated + +_kernels = dict() + + +def get_kernels(epilogue: FnSpecs = FnSpecs.default(), fused_activation: FnSpecs = FnSpecs.default()): + global _kernels + key = (fused_activation.name, epilogue.name) + if key in _kernels: + return _kernels[key] + spec_constants = { + "ACTIVATION_FN": fused_activation.fn, + "EPILOGUE_FN": epilogue.fn, + } + spec_tuples = { + "activation_fn_args": fused_activation.fn_arg_names, + "epilogue_fn_args": epilogue.fn_arg_names, + } + do_not_specialize = fused_activation.fn_arg_do_not_specialize + epilogue.fn_arg_do_not_specialize + import types + + module = types.ModuleType(f"matmul_ogs_{'_'.join(key)}") + sys.modules[module.__name__] = module + module._matmul_ogs = specialize(_matmul_ogs, module, spec_constants, spec_tuples, + do_not_specialize=do_not_specialize) + module._p_matmul_ogs = specialize(_p_matmul_ogs, module, spec_constants, spec_tuples, + do_not_specialize=do_not_specialize) + module._reduce_grouped = specialize(_reduce_grouped, module, spec_constants, spec_tuples, + do_not_specialize=do_not_specialize) + _kernels[key] = module + return module + + +# ----------------------------------------------------------------------------- +# Matrix Multiplication + Outer Gather/Scatter +# ----------------------------------------------------------------------------- + + +def can_overflow_int32(tensor: torch.Tensor): + max_int32 = (1 << 31) - 1 + offset = 0 + for i in range(tensor.ndim): + offset += (tensor.shape[i] - 1) * tensor.stride(i) + return offset > max_int32 + + +def should_upcast_indices(*args): + return any(tensor is not None and can_overflow_int32(tensor) for tensor in args) + + +# --------------------- +# Numerics +# --------------------- + +# fmt: off + +@dataclass(frozen=True) +class FlexCtx: + lhs_data: InFlexData = InFlexData() + rhs_data: InFlexData = InFlexData() + out_data: OutFlexData = OutFlexData() + +@dataclass +class PrecisionConfig: + max_num_imprecise_acc: int = None + allow_tf32: bool = True + flex_ctx: FlexCtx = FlexCtx() + acc_scale: int = 1.0 + flexpoint_saturate_inf: bool = False + report_quantization_err_fn: callable = None + act_scale: Tensor | None = None + weight_scale: Tensor| None = None + out_scale: Tensor | None = None + out_dtype: torch.dtype = None + enforce_bitwise_invariance: bool = False + + +# TODO: merge in opt_flags +def get_swap_xw(precision_config, opt_flags): + if target_info.cuda_capability_geq(10, 0): + return precision_config.weight_scale is not None and opt_flags.block_m <= 64 and opt_flags.is_persistent + return False + +# --------------------- +# Allocation +# --------------------- + +@dataclass +class MatmulAllocation: + device: str + output: tuple[tuple[int], torch.dtype] + scratchpads: dict[str, tuple] + +def init_allocation(x, w, precision_config, fused_activation, routing_data, gather_indx, scatter_indx, opt_flags): + # ---- output ------ + N = w.shape[-1] + # by default - M is number of rows in the activations + M = x.shape[-2] + # if the activations are gathered, then M is number of gather indices + if gather_indx is not None: + M = gather_indx.src_indx.shape[0] + # final output + if routing_data.n_expts_act == 1 or scatter_indx is None: + y_rows = M + else: + Mc = scatter_indx.src_indx.shape[0] // routing_data.n_expts_act # compressed number of rows + y_rows = Mc + batch_dim = x.shape[0] if x.ndim == 3 else 1 + out_shape = (batch_dim, y_rows, N // fused_activation.reduction_n) + out_dtype = precision_config.out_dtype or x.dtype + output = (out_shape, out_dtype) + # ---- scratchpad -----# + scratchpad = dict() + if opt_flags.split_k > 1 or (scatter_indx is not None and not opt_flags.fused_scatter): + scratch_out_dtype = torch.float32 if opt_flags.split_k > 1 else out_dtype + scratchpad["matmul"] = ((opt_flags.split_k, 1, M, N), scratch_out_dtype) + if "matmul" in scratchpad and precision_config.out_scale is not None: + scratchpad["mx_out_scale"] = ((opt_flags.split_k, 1, M, triton.cdiv(N, MXFP_BLOCK_SIZE)), torch.uint8) + return MatmulAllocation(x.device, output, scratchpad) + +def apply_allocation(allocation: MatmulAllocation, output): + ret = dict() + if output is None: + output = torch.empty(allocation.output[0], device=allocation.device, dtype=allocation.output[1]) + else: + assert output.shape == allocation.output[0] + ret["output"] = output[None, :, :] + ret["scratchpad"] = { + k: torch.empty(v[0], device=allocation.device, dtype=v[1]) + for k, v in allocation.scratchpads.items() + } + return ret + +# ----------------------------------------------------------------------------- +# Canonicalize +# ----------------------------------------------------------------------------- +# the `matmul_ogs` kernel can operate on 2D or 3D inputs depending on the mode being used +# we can canonicalize storages to make the implementation more uniform + +def _canonicalize_storage(storage, out_ndim, flex_data): + assert out_ndim >= storage.data.ndim + # Need to use as_strided instead of view because for a tensor with + # shape[-2] == 1 can have ambuiguity related to col-wise. Fo example, + # > t = torch.randn(2, 5, 1).mT + # > t_view = t.view(t.shape) + # > t.stride(), t_view.stride() + # ((5, 1, 1), (5, 5, 1)) + # Our check t_view is col-wise fails since t_view.stride(-2) != 1 + # This case is covered by (m, n, k) == (1000, 700, 2) in test_matmul.py + new_storage_shape = [1] * (out_ndim - storage.data.ndim) + list(storage.data.shape) + new_storage_view = storage.data.view(new_storage_shape) + new_storage_stride = [new_storage_view.stride(0)] * (out_ndim - storage.data.ndim) + list(storage.data.stride()) + new_storage_data = storage.data.as_strided(new_storage_shape, new_storage_stride) + if flex_data is not None: + new_storage_data = flex_data.reinterpret(new_storage_data) + return Storage(new_storage_data, storage.layout) + +# + +def reduce_grouped(x: torch.Tensor, indx: torch.Tensor, out: torch.Tensor, out_mx_scale: torch.Tensor, + fused_activation, epilogue, + x_flex: InFlexData | None = None, + out_flex: OutFlexData | None = None, x_mx_scale: torch.Tensor | None = None, + out_dtype: bool = None, flexpoint_saturate_inf: bool = False): + """ + In-place grouped row reduction. + + Arguments + - x: Tensor[AnyFloat] of shape [(num_groups * K), N] + - indx: Tensor[Int] of shape [num_groups, K] + + Description + For each group g in [0, num_groups), this routine sums the K rows of `x` + specified by `indx[g, :]` and overwrites the row corresponding to the first + valid (non-negative) index with the per-group sum. Accumulation is performed + in float32 for numerical stability, and the result is written back in the + dtype of `x`. + + Behavior and edge cases + - Invalid (-1) entries are skipped during accumulation and do not generate + memory traffic. If a group has no valid entries, nothing is written for + that group. + - Reduction is performed tile-by-tile along the N dimension within a single + kernel launch (persistent along N) to minimize launch overhead. + + Performance notes + - Memory traffic per group is approximately (valid_rows_read + 1) * N * sizeof(x), + plus index reads. With no invalid entries, this becomes (K + 1) reads/writes + of length N per group. + + Returns + - The input tensor `x` (modified in place). + """ + if indx is None and x.shape[0] == 1: + return x.squeeze(0), None + if indx is not None: + num_groups = indx.shape[0] + else: + num_groups = x.shape[-2] + if x_flex is None: + x_flex = InFlexData() + if out_flex is None: + out_flex = OutFlexData() + K = 1 if indx is None else indx.shape[1] + out_dtype = x.dtype if out_dtype is None else out_dtype + assert x.shape[-1] % fused_activation.reduction_n == 0 + BLOCK_N = 512 + # Resolve scalar flex scales (may be None) + x_expected_scale = None if x_flex is None else x_flex.scale + out_expected_scale = None if out_flex is None else out_flex.expected_scale + out_actual_scale = None if out_flex is None else out_flex.actual_scale + out_checksum_scale = None if out_flex is None else out_flex.checksum_scale + # Resolve MXFP output scale row stride + stride_mxb = 0 if x_mx_scale is None else x_mx_scale.stride(0) + stride_mxs = 0 if x_mx_scale is None else x_mx_scale.stride(1) + stride_omxs = 0 if out_mx_scale is None else out_mx_scale.stride(0) + kernels = get_kernels(epilogue.specs, fused_activation.specs) + kernels._reduce_grouped[(num_groups, )]( + x_flex.reinterpret(x), x.stride(0), x.stride(2), x.stride(3), # + x_expected_scale, # scalar input scale + out_flex.reinterpret(out), out.stride(1), out.stride(2), # + out_expected_scale, out_actual_scale, out_checksum_scale, indx, # + x.shape[0], x.shape[-1], # + x_mx_scale, stride_mxb, stride_mxs, # + out_mx_scale, stride_omxs, # + *fused_activation.fn_args, fused_activation.reduction_n, + *epilogue.fn_arg_values_finalize, + HAS_IN_MX_SCALE=x_mx_scale is not None, HAS_OUT_MX_SCALE=out_mx_scale is not None, + FLEXPOINT_SATURATE_INF=flexpoint_saturate_inf, # + BLOCK_N=BLOCK_N, K=K, # + num_warps=1, # + ) + return out, out_mx_scale + +# ----------------------------------------------------------------------------- +# Triton Implementation +# ----------------------------------------------------------------------------- + +def matmul_ogs_set_idle_sms(num_idle_sms): + """ + persistent kernels will leave `num_idle_sms` idle + """ + update_opt_flags_constraints({"idle_sms": num_idle_sms}) + +def matmul_ogs(x, w, bias, + routing_data: RoutingData | None = None, + gather_indx: GatherIndx | None = None, + scatter_indx: ScatterIndx | None = None, + precision_config: PrecisionConfig | None = None, + betas: torch.Tensor | None = None, + gammas: torch.Tensor | None = None, + out_alpha: float | None = None, + y: torch.Tensor | None = None, + fused_activation: FusedActivation | None = None, + epilogue: Epilogue | None = None, + ): + """ + Y[:, :] = 0. + for e in num_experts: + Y[idxs_y_m(e), :] += matmul(X[idxs_x_m(e), :], W[e, :, :]) + """ + is_input_batched = x.ndim == 3 + if is_input_batched: + assert gather_indx is None, "gather not supported in batched mode" + assert scatter_indx is None, "scatter not supported in batched mode" + assert routing_data is None, "routing not supported in batched mode" + assert w.ndim == 3 and w.shape[0] == x.shape[0] + # canonicalize inputs + if precision_config is None: + precision_config = PrecisionConfig() + if fused_activation is None: + fused_activation = FusedActivation(FnSpecs.default(), tuple(), 1) + if epilogue is None: + epilogue = Epilogue(FnSpecs.default(), tuple(), tuple(), False) + if routing_data is None: + routing_data = RoutingData(None, None, max(1, w.shape[0]), 1) + # unpack scales + w_scale = precision_config.weight_scale + w_has_mx = w_scale is not None + is_hopper_fp8 = is_cuda() and not target_info.cuda_capability_geq(10, 0) and bitwidth(w.dtype) == 8 + if is_hopper_fp8: assert w.stride(-2) == 1, "`w` must be column-major when it has data-type FP8 on capability < 10" + if not isinstance(w, Tensor): + # TODO: remove this code path; using uint8 for mxfp4 weight will bite us when we want to support uint8 for real + dtype = FP4 if w.dtype == torch.uint8 else w.dtype + w = wrap_torch_tensor(w, dtype=dtype) + if w_scale is not None and not isinstance(w_scale, Tensor): + w_scale = Tensor(w_scale) + if w_scale is not None: + w_scale.storage.data = w_scale.data.view(torch.uint8) + w_scale.dtype = torch.uint8 + x_scale = precision_config.act_scale + x_has_mx = x_scale is not None + if x_has_mx: assert x.stride(-1) == 1, "'x' must be row-major when it has data-type mxfp" + if x_scale is not None and not isinstance(x_scale, Tensor): + x_scale = Tensor(x_scale) + if not isinstance(x, Tensor): + x = Tensor(x, dtype=x.dtype) + # determine shapes + has_gather = gather_indx is not None + has_scatter = scatter_indx is not None + is_ragged = routing_data.expt_hist is not None + M = x.shape[-2] if gather_indx is None else gather_indx.src_indx.shape[0] + batch_size = w.shape[0] if routing_data.expt_hist is None and w.ndim == 3 else 1 + K, N = w.shape[-2:] + assert K == x.shape[-1] + if x.ndim == 3 and w.ndim == 3: + assert x.shape[0] == w.shape[0] + # compute optimization flags + out_dtype = precision_config.out_dtype or x.dtype + can_use_tma = x.numel() > 0 and x.storage.is_tma_compliant() and \ + w.numel() > 0 and w.storage.is_tma_compliant() and \ + (w_scale is None or w_scale.storage.is_tma_compliant()) + # hopper w/ mxfp4 doesn't support TMA + can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4) + can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1) + opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config, + M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize, + ) + if not can_use_fused_scatter and opt_flags.fused_scatter: + raise InapplicableConstraint("Fused scatter is not supported") + if w_scale is not None and opt_flags.is_persistent and not target_info.has_native_mxfp(): + raise NotImplementedError("Must use non-persistent kernel for simulated MXFP") + if w_scale is not None and w_scale.storage.layout.name is not None and not opt_flags.is_persistent and target_info.has_native_mxfp(): + raise NotImplementedError("Must use persistent kernel and be TMA-compliant for native MXFP") + # fused activation + matmul_fused_activation = fused_activation + reduce_fused_activation = FusedActivation() + if opt_flags.split_k > 1 or (scatter_indx is not None and not opt_flags.fused_scatter): + matmul_fused_activation, reduce_fused_activation = reduce_fused_activation, matmul_fused_activation + # allocate output/scratchpad memory + allocation = init_allocation(x, w, precision_config, fused_activation, + routing_data, gather_indx, scatter_indx, opt_flags) + memory = apply_allocation(allocation, y) + # early exit + if batch_size * M * N == 0: + ret = memory["output"].squeeze(0) + if not is_input_batched: + ret = ret.squeeze(0) + return ret + # TMA descriptors require a global memory allocation + if opt_flags.is_persistent: + triton.set_allocator(get_per_device_per_stream_alloc_fn(x.device)) + # Intermediate tensors and postprocess kernels for each situation + has_scratchpad = "matmul" in memory["scratchpad"] + # Canonical output tensor (matmul scratchpad if present, otherwise final output tensor) + out_matmul = memory["scratchpad"].get("matmul", memory["output"]) + out_matmul_flex = OutFlexData() if out_matmul.dtype == torch.float32 else precision_config.flex_ctx.out_data + # Unified mx-scale pointer; when scratchpad exists, prefer its mx buffer + out_matmul_scale = precision_config.out_scale + if out_matmul_scale is not None: + out_matmul_scale = out_matmul_scale.data.view(torch.uint8) + if has_scratchpad and "mx_out_scale" in memory["scratchpad"]: + out_matmul_scale = memory["scratchpad"]["mx_out_scale"] + out_matmul_has_mx = out_matmul_scale is not None and out_matmul.element_size() == 1 + # matrix multiplication + flex = precision_config.flex_ctx + bias_stride = None if bias is None else bias.stride(0) + num_indx = None if scatter_indx is None else scatter_indx.src_indx.shape[0] + # moe metadata + expt_data = routing_data.expt_data + block_m = opt_flags.block_m + expt_hist = None if expt_data is None else expt_data.hist + expt_hist_sum = None if expt_data is None else expt_data.token_offs_pad[block_m][-1] + expt_token_offs_raw = None if expt_data is None else expt_data.token_offs_raw + expt_block_pid_map = None if expt_data is None else expt_data.block_pid_map[block_m] + # spmd grid + grid_m = triton.cdiv(M, opt_flags.block_m) + if expt_block_pid_map is not None: + grid_m = routing_data.n_blocks(M, opt_flags.block_m) + grid_n = triton.cdiv(N, opt_flags.block_n) + max_grid = batch_size * grid_m * grid_n * opt_flags.split_k + grid = min(target_info.num_sms() - opt_flags.idle_sms, max_grid) if opt_flags.is_persistent else max_grid + # canonicalize storage + has_gather_tma = has_gather and target_info.has_tma_gather() + has_scatter_tma = opt_flags.fused_scatter and target_info.has_tma_gather() + y = wrap_torch_tensor(out_matmul.view(math.prod(out_matmul.shape[:-1]), out_matmul.shape[-1]) if opt_flags.fused_scatter else out_matmul.view(math.prod(out_matmul.shape[:-2]), *out_matmul.shape[-2:])) + x_storage = _canonicalize_storage(x.storage, 2 if has_gather_tma else 3, flex.lhs_data) + w_storage = _canonicalize_storage(w.storage, 3, flex.rhs_data) + y_storage = _canonicalize_storage(y.storage, 2 if has_scatter_tma else 3, flex.out_data) + # create tma descriptor for x + x_has_tma = opt_flags.is_persistent and (has_gather_tma or not has_gather) + x_tma_block_size = [1, opt_flags.block_k] if has_gather_tma else [1, opt_flags.block_m, opt_flags.block_k] + x_tma_mode = None if not x_has_tma else "ragged" if is_ragged and not has_gather_tma else "dense" + x_tensor_or_tma = x_storage.make_tma(x_tma_block_size, x_tma_mode) if x_has_tma else x_storage.data + # create tma descriptor for y + y_has_tma = opt_flags.is_persistent and (has_scatter_tma or not opt_flags.fused_scatter) + block_n = opt_flags.block_n // opt_flags.epilogue_subtile // matmul_fused_activation.reduction_n + y_tma_block_size = [1, block_n] if has_scatter_tma else [1, opt_flags.block_m, block_n] + y_tma_mode = None if not y_has_tma else "ragged" if is_ragged and not has_scatter_tma else "dense" + y_tensor_or_tma = y_storage.make_tma(y_tma_block_size, y_tma_mode) if y_has_tma else y_storage.data + # create tma descriptor for w + w_has_tma = opt_flags.is_persistent + w_tensor_or_tma = w_storage.make_tma([1, opt_flags.block_k, opt_flags.block_n], "dense") if w_has_tma else w_storage.data + # create tma descriptor for w_scale + w_scale_tensor_or_tma = w_scale + w_scale_has_tma = opt_flags.is_persistent and w_scale is not None + w_scale_tensor_or_tma = w_scale.storage.make_tma([opt_flags.block_n, opt_flags.block_k], "dense") if w_scale_has_tma else w_scale + # canonicalize strides + x_strides = [0]*(3 - x_storage.data.ndim) + list(x_storage.data.stride()) + x_scale_strides = x_scale.stride() if x_has_mx else (None, None, None) + x_scale_strides = (0, ) * (3 - len(x_scale_strides)) + x_scale_strides + w_scale_strides = w_scale.stride() if w_has_mx and not w_scale_has_tma else (None, None, None) + w_scale_strides = (0, ) * (3 - len(w_scale_strides)) + w_scale_strides + out_matmul_scale_strides = out_matmul_scale.stride() if out_matmul_has_mx else (None, None, None, None) + out_matmul_scale_strides = (0, ) * (3 - len(out_matmul_scale_strides)) + out_matmul_scale_strides + # launch kernel + kernels = get_kernels(epilogue.specs, matmul_fused_activation.specs) + # When stride(-2) == stride(-1) == 1, it's ambiguous whether W is transposed + # (i.e. col-wise). Since this matters when w_has_mx is True and w_transpose + # is True the fast code path, stride(-2) == 1 takes precedence, e.g., vs. + # w_transpose = w_storage.data.stride()[-1] != 1 + w_transpose = w_storage.data.stride()[-2] == 1 + (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(grid,)]( + y_tensor_or_tma, y_storage.data, *out_matmul.stride(), + *((None, out_matmul_scale, None) if out_matmul_has_mx else out_matmul_flex), + *out_matmul_scale_strides[-3:], + x_tensor_or_tma, x_storage.data, *x_strides, + flex.lhs_data.scale, + None if x_scale is None else x_scale.data.view(torch.uint8), *x_scale_strides, + w_tensor_or_tma, w_storage.data, *w_storage.data.stride(), w_transpose, + flex.rhs_data.scale, + w_scale_tensor_or_tma, *w_scale_strides, + bias, bias_stride, + x.shape[-2], + x.shape[-2] if routing_data.expt_hist is None else None, + N, K, + betas, gammas, + None if gather_indx is None else gather_indx.src_indx, + None if scatter_indx is None else scatter_indx.src_indx, + num_indx, + None if not opt_flags.fused_scatter else scatter_indx.dst_indx, + None if not opt_flags.fused_scatter else scatter_indx.dst_indx.shape[0], + expt_hist, expt_token_offs_raw, expt_hist_sum, expt_block_pid_map, + batch_size, grid_m, grid_n, + out_alpha, + *matmul_fused_activation.fn_args, matmul_fused_activation.reduction_n, + *epilogue.fn_arg_values_matmul, + routing_data.n_expts_tot, routing_data.n_expts_act, + precision_config.max_num_imprecise_acc, + precision_config.allow_tf32, + precision_config.flexpoint_saturate_inf, + flex.rhs_data.is_per_batch, + opt_flags.block_m, + opt_flags.block_n, + opt_flags.block_k, + opt_flags.group_m, + XCD_SWIZZLE=opt_flags.xcd_swizzle, + SWIZZLE_MX_VALUE=w.storage.layout.name, + SWIZZLE_MX_SCALE=None if w_scale is None else w_scale.storage.layout.name, + EPILOGUE_SUBTILE=opt_flags.epilogue_subtile, + SPLIT_K=opt_flags.split_k, + EVEN_K=K % opt_flags.block_k == 0, + W_CACHE_MODIFIER=opt_flags.w_cache_modifier, + TOKENS_PER_EXPT_FOR_ANNOTATION=routing_data.expected_tokens_per_expt, + num_warps=opt_flags.num_warps, + num_stages=opt_flags.num_stages, + arch=opt_flags.arch, + UPCAST_INDICES=should_upcast_indices(x, w, out_matmul), + X_TMA_MODE=x_tma_mode, + Y_TMA_MODE=y_tma_mode, + SWAP_XW=get_swap_xw(precision_config, opt_flags), + IS_EPILOGUE_QUANT_MXFP8=epilogue.specs.name == FnName.QUANTIZE_MXFP8.name, + NUM_SMS = grid if opt_flags.is_persistent else 0, + **opt_flags.target_kernel_kwargs) + # Build grouped reduction inputs in a uniform way + group_indx = None if scatter_indx is None or opt_flags.fused_scatter else scatter_indx.src_indx.view(-1, routing_data.n_expts_act) + out_final, out_final_mx_scale = reduce_grouped( + out_matmul, + group_indx, + memory["output"].squeeze(0), + precision_config.out_scale, + reduce_fused_activation, + epilogue, + x_flex=InFlexData(dtype=out_matmul_flex.dtype, scale=out_matmul_flex.expected_scale), + out_flex=precision_config.flex_ctx.out_data, + x_mx_scale=out_matmul_scale.squeeze(1) if out_matmul_has_mx else None, + out_dtype=memory["output"].dtype, + flexpoint_saturate_inf=precision_config.flexpoint_saturate_inf, + ) + if not is_input_batched: + out_final = out_final.squeeze(0) + if out_final_mx_scale is not None: + precision_config.out_scale = out_final_mx_scale + return out_final + +# ----------------------------------------------------------------------------- +# Reference Implementation +# ----------------------------------------------------------------------------- + +def matmul_ogs_torch(x, w, bias, + routing_data: RoutingData = None, + gather_indx: GatherIndx = None, + scatter_indx: ScatterIndx = None, + precision_config: PrecisionConfig = None, + betas = None, + gammas = None, + round_x = None, round_y = None, + ): + is_input_batched = x.ndim == 3 + assert x.dtype.itemsize > 1 + assert w.dtype.itemsize > 1 + if is_input_batched: + assert gather_indx is None, "gather not supported in batched mode" + assert scatter_indx is None, "scatter not supported in batched mode" + assert routing_data is None, "routing not supported in batched mode" + assert w.ndim == 3 and w.shape[0] == x.shape[0] + if round_x is None: + round_x = lambda x, idx: x + if round_y is None: + round_y = lambda x: x + if bias is not None and bias.ndim == 1: + bias = bias.view(1, *bias.shape) + if w.ndim == 2: + w = w.view(1, *w.shape) + if x.ndim == 2: + x = x.view(1, *x.shape) + if routing_data is None: + routing_data = RoutingData(None, None, w.shape[0], 1) + n_expts_act = routing_data.n_expts_act + # memory offsets + if routing_data.n_expts_tot > 1 and not is_input_batched: + sizes = routing_data.expt_hist + off = torch.zeros(sizes.shape[0] + 1, dtype=torch.int32) + off[1:] = torch.cumsum(sizes, 0) + offs = list(itertools.pairwise(off)) + else: + offs = [[0, x.shape[1]] for _ in range(w.shape[0])] + # compute + n_rows = x.shape[1] if gather_indx is None else gather_indx.dst_indx.shape[0] + y = torch.zeros((x.shape[0], n_rows, w.shape[-1]), device=x.device, dtype=x.dtype) + for i, (lo, hi) in enumerate(offs): + if gather_indx is None: + idx = torch.arange(lo, hi, device=x.device) + else: + idx = gather_indx.src_indx[lo:hi] // n_expts_act + batch = i if is_input_batched else 0 + out = torch.matmul(round_x(x[batch, idx, :], torch.arange(lo, hi, device="cuda")).float(), + w[i].float()) + if bias is not None: + out += bias[i, :] if betas is None else bias[i, :] * betas[lo:hi, None] + if gammas is not None: + out *= gammas[lo:hi, None] + y[batch, lo:hi, :] = round_y(out) + if not is_input_batched: + y = y.view(y.shape[1], y.shape[2]) + if scatter_indx is None: + return y + # accumulate output from all experts + n_rows = y.shape[0] // n_expts_act + out = torch.zeros((n_rows, y.shape[-1]), dtype=torch.float32, device=x.device) + for i, (lo, hi) in enumerate(offs): + dst_idx = scatter_indx.dst_indx[lo:hi] // n_expts_act + msk = dst_idx != -1 + out[dst_idx[msk], :] += y[lo:hi, :][msk, :].float() + return out diff --git a/triton_kernels/matmul_ogs_details/__init__.py b/triton_kernels/matmul_ogs_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/matmul_ogs_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/matmul_ogs_details/_common.py b/triton_kernels/matmul_ogs_details/_common.py new file mode 100644 index 0000000000..c149dcf10e --- /dev/null +++ b/triton_kernels/matmul_ogs_details/_common.py @@ -0,0 +1,169 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py +# Triton is licensed under the MIT License. + +import torch + +import triton +import triton.language as tl + +# ----------------------------------------------------------------------------- +# Utilities +# ----------------------------------------------------------------------------- + + +@triton.constexpr_function +def get_scaled_dot_format_string(dtype: tl.dtype): + mapping = { + tl.float16: "fp16", + tl.bfloat16: "bf16", + tl.uint8: "e2m1", + tl.float8e4nv: "e4m3", + tl.float8e5: "e5m2", + } + return mapping[dtype] + + +@triton.jit +def xcd_swizzle(pid, domain_size, XCD_SWIZZLE: tl.constexpr): + """ + Swizzle the program id based on integer XCD_SWIZZLE. + This is useful for reording how blocks are ordered. A scheduler may, for example, + assign sequential blocks 0, 1, 2, 3, ..., 8, 9, 10.. to its 8 hardware units 0, 1, 2, 3, ..., 0, 1, 2. + This pattern may not be ideal for memory access, and it may be better to swizzle so the assignment + becomes 0, 0, 0, 0, ..., 1, 1, 1, ... In the swizzled arrangement, sequential blocks are assigned to + the same hardware unit. + """ + # Number of pids per group in the new arrangement + pids_per_group = domain_size // XCD_SWIZZLE + extra_pid_groups = domain_size % XCD_SWIZZLE + + # Compute current current and local pid within the group + group = pid % XCD_SWIZZLE + local_pid = pid // XCD_SWIZZLE + + # Calculate new pid based on the new grouping + new_pid = group * pids_per_group + min(group, extra_pid_groups) + local_pid + return new_pid + + +@triton.jit +def swizzle2d(pid, grid_m, grid_n, GROUP_M: tl.constexpr): + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + tl.assume(group_size >= 0) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + return pid_m, pid_n + + +def make_matmul_repr(base_name, order): + + def matmul_repr(specialization): + signature = specialization.signature + constants = specialization.constants + reorder = lambda L: [L[i] for i in order] + layout = lambda stride: "N" if stride in constants else "T" + + def convert_dtype(dtype): + if "tensordesc" in dtype: + ret = convert_dtype(dtype.split("<")[1].split("[")[0]) + return ret + elif "u8" in dtype: + return "mxfp4" + elif dtype[0] == "*": + return dtype[1:] + else: + return dtype + + dtypes = "x".join([convert_dtype(f"{signature[i]}") for i in reorder(["Y", "X", "W"])]) + layouts = "".join([f"{layout(i)}" for i in reorder(["stride_y_n", "stride_x_k", "stride_w_n"])]) + blocks = "x".join([f"{constants[i]}" for i in ["BLOCK_M", "BLOCK_N", "BLOCK_K", "SPLIT_K"]]) + # mode = [] + # if "GatherIndx" not in constants: + # mode += ['g'] + # if "ScatterSrcIndx" not in constants: + # mode += ['s'] + # suffix = "" if not mode else "_o" + (''.join(mode)) + # if base_name.startswith("_p"): + # suffix += "_ptma" + return f"{base_name}_{layouts}_{dtypes}_{blocks}" + + return matmul_repr + + +def matmul_launch_metadata(grid, kernel, args): + from ..proton_opts import launch_metadata_allow_sync + + ret = dict() + M, N, K = args["M"], args["N"], args["K"] + Y, X, W = args["YPtr"], args["XPtr"], args["WPtr"] + tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION") + hist = args["ExptHist"] + if hist is not None: + # If annotation is given, use that to generate name for profiling. + if tokens_per_expt is not None: + n_rows = f"{tokens_per_expt}*" + elif launch_metadata_allow_sync(): + n_rows = int(hist.float().mean()) + else: + n_rows = "unknown" + + if launch_metadata_allow_sync(): + n_tokens = float(hist.sum()) + n_w_bytes = (W.numel() * W.element_size() // hist.numel()) * (hist > 0).sum() + elif tokens_per_expt is not None: + n_tokens = tokens_per_expt * args["N_EXPTS_TOT"] + # This may not be totally correct (e.g., we might not be using all experts) + # but it's better than nothing. + n_w_bytes = W.numel() * W.element_size() + else: + n_tokens = None + n_w_bytes = 0 + + # If annotation is given, use that to generate name for profiling. + tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION") + n_rows = f"{tokens_per_expt}*" if tokens_per_expt is not None else n_rows + else: + n_tokens = None + n_w_bytes = W.numel() * W.element_size() + repr = lambda s, x: f"{s} = {x}" if x is not None else f"E_{len(hist)}({s}) = {n_rows}" + nbits = X.dtype.itemsize * 8 + batch_repr = "" + if "batch_size" in args and args["batch_size"] > 1: + batch_repr = repr("B", args["batch_size"]) + ", " + ret["name"] = f"{kernel.name} [{batch_repr}{repr('M', M)}, {repr('N', N)}, {repr('K', K)}] stg{kernel.num_stages}" + ep_subtile = args["EPILOGUE_SUBTILE"] + if ep_subtile is not None and ep_subtile > 1: + ret["name"] += f" ep/{ep_subtile}" + + if hist is not None and n_tokens is None: + return ret # Don't fill metadata because we can't compute them properly. + + fM = M if M is not None else n_tokens + fK = K if K is not None else n_tokens + ret[f"flops{nbits}"] = 2.0 * fM * N * fK + + gindx = args.get("GatherIndx", None) + # sindx = args.get("WriteBackIndx", None) + n_x_bytes = X.numel() * X.element_size() + n_y_bytes = Y.numel() * Y.element_size() + if hist is not None: + assert n_tokens is not None + n_expts_act = args["N_EXPTS_ACT"] + + if (gindx is not None) and launch_metadata_allow_sync(): + # recreate inverse GatherIndx. + dst = torch.full_like(gindx, -1) + idx = torch.arange(len(gindx), device=gindx.device, dtype=torch.int32) + mask = (gindx != -1) + dst[gindx[mask]] = idx[mask] + n_read_rows = (dst.view((-1, n_expts_act)) != -1).any(dim=1).sum() + else: + n_read_rows = n_tokens + n_x_bytes = n_read_rows * X.shape[-1] * X.element_size() + n_y_bytes = n_tokens * Y.shape[-1] * Y.element_size() + ret["bytes"] = int(n_x_bytes + n_y_bytes + n_w_bytes) + + return ret diff --git a/triton_kernels/matmul_ogs_details/_matmul_ogs.py b/triton_kernels/matmul_ogs_details/_matmul_ogs.py new file mode 100644 index 0000000000..9fcffec8ef --- /dev/null +++ b/triton_kernels/matmul_ogs_details/_matmul_ogs.py @@ -0,0 +1,433 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py +# Triton is licensed under the MIT License. + +# isort: off +# fmt: off +import triton +import triton.language as tl +from triton_kernels.tensor_details.layout_details.blackwell_scale import unswizzle_mx_scale_bw +from triton_kernels.tensor_details.layout_details.hopper_scale import unswizzle_mxfp4_scale_hopper +from triton_kernels.tensor_details.layout_details.hopper_value import mxfp4_to_bf16_triton +from triton_kernels.tensor_details.layout_details.cdna4_scale import unswizzle_mx_scale_cdna4 +from triton_kernels.numerics_details.flexpoint import float_to_flex, load_scale +from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE +from ._common import make_matmul_repr, matmul_launch_metadata, swizzle2d, xcd_swizzle, get_scaled_dot_format_string + + +@triton.jit +def _zero_masked_rows( + pid_m, pid_n, + Y, stride_y_m, stride_y_n, + N, + ScatterSrcIndx, num_idxs, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): + offs_m = BLOCK_M * pid_m.to(tl.int64) + tl.arange(0, BLOCK_M) + offs_n = BLOCK_N * pid_n + tl.arange(0, BLOCK_N) + src_idx = tl.load(ScatterSrcIndx + offs_m, mask=offs_m < num_idxs, other=0) + YPtrs = Y + offs_m[:, None] * stride_y_m + offs_n[None, :] * stride_y_n + mask_n = offs_n < N + mask = (src_idx == -1)[:, None] & mask_n[None, :] + tl.store(YPtrs, tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32), mask=mask) + + +_matmul_ogs_repr = make_matmul_repr("_matmul_ogs", [0, 1, 2]) +@triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"], + repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata) +def _matmul_ogs( + Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n, + YExpectedScale, YActualScale, YChecksumScale, + stride_y_mx_z, stride_y_mx_m, stride_y_mx_n, + X, XPtr, stride_x_z, stride_x_m, stride_x_k, + XScale, + XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k, + W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr, + WScale, + WMxScale, stride_w_mx_e, stride_w_mx_k, stride_w_mx_n, + B, stride_b_e, # Bias + NRows, M, N, K, # shapes + # expt data + Betas, Gammas, + GatherIndx, + ScatterSrcIndx, num_idxs, + WriteBackIndx, writeback_size, + ExptHist, ExptOffs, ExptOffsSum, ExptData, + # true grid size + batch_size, grid_m, grid_n, + # Out scale + out_alpha, + # fused activation function + ACTIVATION_FN: tl.constexpr, activation_fn_args, ACTIVATION_REDUCTION_N: tl.constexpr, + # epilogue transform + EPILOGUE_FN: tl.constexpr, epilogue_fn_args, + # MoE config + N_EXPTS_TOT: tl.constexpr, N_EXPTS_ACT: tl.constexpr, + # precision config + MAX_NUM_IMPRECISE_ACC: tl.constexpr, ALLOW_TF32: tl.constexpr, + FLEXPOINT_SATURATE_INF: tl.constexpr, + PER_BATCH_SCALE: tl.constexpr, + # optimization config + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, XCD_SWIZZLE: tl.constexpr, + # One of ["HOPPER", "BLACKWELL", None] + SWIZZLE_MX_VALUE: tl.constexpr, + # One of ["HOPPER", "BLACKWELL", None] + SWIZZLE_MX_SCALE: tl.constexpr, + EPILOGUE_SUBTILE: tl.constexpr, + EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, + W_CACHE_MODIFIER: tl.constexpr, + NUM_SMS: tl.constexpr, + X_TMA_MODE: tl.constexpr, + Y_TMA_MODE: tl.constexpr, + TOKENS_PER_EXPT_FOR_ANNOTATION=None, + UPCAST_INDICES: tl.constexpr = False, + SWAP_XW: tl.constexpr = False, + IS_EPILOGUE_QUANT_MXFP8: tl.constexpr = False): + + tl.assume(stride_y_k >= 0) + tl.assume(stride_y_z >= 0) + tl.assume(stride_y_m >= 0) + tl.assume(stride_y_n >= 0) + tl.assume(stride_x_z >= 0) + tl.assume(stride_x_m >= 0) + tl.assume(stride_x_k >= 0) + tl.assume(stride_w_e >= 0) + tl.assume(stride_w_k >= 0) + tl.assume(stride_w_n >= 0) + if stride_w_mx_e is not None: + tl.assume(stride_w_mx_e >= 0) + if stride_w_mx_k is not None: + tl.assume(stride_w_mx_k >= 0) + if stride_w_mx_n is not None: + tl.assume(stride_w_mx_n >= 0) + if B is not None: + tl.assume(stride_b_e >= 0) + tl.assume(batch_size >= 0) + tl.assume(grid_m >= 0) + tl.assume(grid_n >= 0) + + is_w_microscaled: tl.constexpr = WMxScale is not None + MX_PACK_DIVISOR: tl.constexpr = MXFP_BLOCK_SIZE + if is_w_microscaled: + w_type: tl.constexpr = W.dtype.element_ty + is_mxfp4: tl.constexpr = w_type == tl.uint8 + tl.static_assert(w_type == tl.uint8 or (w_type == tl.float8e4nv or w_type == tl.float8e5), + "mx_weight_ptr must be uint8 or fp8") + tl.static_assert(WMxScale.dtype.element_ty == tl.uint8, "mx_scale_ptr must be uint8") + tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR") + tl.static_assert(SWIZZLE_MX_VALUE == "HOPPER_VALUE" or SWIZZLE_MX_VALUE is None, "Only Hopper swizzling is supported for values") + else: + tl.static_assert(SWIZZLE_MX_VALUE is None) + tl.static_assert(SWIZZLE_MX_SCALE is None) + is_x_microscaled: tl.constexpr = XMxScale is not None + if is_x_microscaled: + x_type: tl.constexpr = X.dtype.element_ty + tl.static_assert(is_w_microscaled) + tl.static_assert(x_type == tl.float8e4nv, "mx_act_ptr must be float8e4nv") + tl.static_assert(XMxScale.dtype.element_ty == tl.uint8, "mx_scale_ptr must be uint8") + tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR") + is_out_microscaled: tl.constexpr = stride_y_mx_z is not None + + OUT_BLOCK_N: tl.constexpr = BLOCK_N // ACTIVATION_REDUCTION_N + yN = N // ACTIVATION_REDUCTION_N + + pid = tl.program_id(0) + if ExptOffsSum is not None and XCD_SWIZZLE > 1: + # Determine how much padding there is on the expert data. This allows us to + # know the true grid size and avoid processing padding tiles. + padding_m = grid_m - tl.load(ExptOffsSum) + else: + padding_m: tl.constexpr = 0 + + HAS_FUSED_SCATTER: tl.constexpr = WriteBackIndx is not None + index_type: tl.constexpr = tl.int64 if UPCAST_INDICES else tl.int32 + + unpadded_m = grid_m - padding_m + tl.assume(unpadded_m >= 0) + total_actual_tiles = batch_size * unpadded_m * grid_n * SPLIT_K + if padding_m > 0 and pid >= total_actual_tiles: + tl.device_assert(batch_size == 0) + pid_mn = pid - total_actual_tiles + if pid_mn < padding_m * grid_n: + pid_m, pid_n = swizzle2d(pid_mn, padding_m, grid_n, GROUP_M) + + # set masked out rows to 0 + if HAS_FUSED_SCATTER and N_EXPTS_ACT == 1: + _zero_masked_rows(pid_m, pid_n, Y, stride_y_m, stride_y_n, yN, ScatterSrcIndx, num_idxs, BLOCK_M, OUT_BLOCK_N) + return + + # swizzle program ids + pid_emnk = pid + if XCD_SWIZZLE != 1: + pid_emnk = xcd_swizzle(pid_emnk, total_actual_tiles, XCD_SWIZZLE) + pid_e = pid_emnk // (unpadded_m * grid_n * SPLIT_K) + pid_mnk = pid_emnk % (unpadded_m * grid_n * SPLIT_K) + pid_k = pid_mnk % SPLIT_K + pid_mn = pid_mnk // SPLIT_K + pid_m, pid_n = swizzle2d(pid_mn, unpadded_m, grid_n, GROUP_M) + # For split-k, advance to the output k slice + if SPLIT_K > 1: + Y += pid_k.to( index_type) * stride_y_k + if is_out_microscaled: + YActualScale += pid_k.to(index_type) * stride_x_mx_k + # set masked out rows to 0 + if HAS_FUSED_SCATTER and N_EXPTS_ACT == 1: + _zero_masked_rows(pid_m, pid_n, Y, stride_y_m, stride_y_n, yN, ScatterSrcIndx, num_idxs, BLOCK_M, OUT_BLOCK_N) + # unpack expert data + if ExptData is None: + tl.static_assert(M is not None) + expt_id, start_z, start_m, block_id = pid_e, pid_e, 0, pid_m + else: + tl.static_assert(M is None) + expt_data = tl.load(ExptData + pid_m) + if expt_data == -1: + return + expt_id = expt_data & 0x0000FFFF + block_id = expt_data >> 16 + M = tl.load(ExptHist + expt_id) + start_m = tl.load(ExptOffs + expt_id) + start_z = 0 + expt_id, block_id = expt_id.to(index_type), block_id.to(index_type) + start_m, start_z = start_m.to(index_type), start_z.to(index_type) + pid_n, pid_k = pid_n.to(index_type), pid_k.to(index_type) + # A pointers + offs_x_m = BLOCK_M * block_id + tl.arange(0, BLOCK_M) + offs_x_m = tl.max_contiguous(tl.multiple_of(offs_x_m % M, BLOCK_M), BLOCK_M) + X += start_z * stride_x_z + if GatherIndx is None: + X += start_m * stride_x_m + else: + GatherIndx += start_m + # no needs to bounds-check here because `offs_x_m` wraps around M dim + offs_x_m = tl.load(GatherIndx + offs_x_m) // N_EXPTS_ACT + offs_k = BLOCK_K * pid_k + tl.arange(0, BLOCK_K) + XPtrs = X + offs_x_m.to(index_type)[:, None] * stride_x_m + offs_k.to(index_type)[None, :] * stride_x_k + + # TODO: refactor if/else when triton front end improves + if is_w_microscaled: + if SWIZZLE_MX_VALUE == "HOPPER_VALUE": + tl.static_assert(is_mxfp4, "Only mxfp4 is supported for HOPPER swizzling") + tl.static_assert(not is_x_microscaled) + # We have pack 2 fp4 values in a byte but we divide the dimension by 2 + # when swizzling + W_K_DIVISOR: tl.constexpr = 1 + W_K_MULTIPLIER: tl.constexpr = 2 + W_N_DIVISOR: tl.constexpr = 4 + else: + # We have pack 2 fp4 values in a byte + W_K_DIVISOR: tl.constexpr = 2 if is_mxfp4 else 1 + W_K_MULTIPLIER: tl.constexpr = 1 + W_N_DIVISOR: tl.constexpr = 1 + + if W_TRANSPOSE: + # When weight is transposed, 2 fp4 values are packed per Byte along + # the contiguous dimension, K. + PACKED_BLOCK_K_W: tl.constexpr = (BLOCK_K // W_K_DIVISOR) * W_K_MULTIPLIER + PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N // W_N_DIVISOR + else: + # When weight is not transposed, fp4 values are *not* packed along + # the contiguous dimension, N. + PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K + PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N // W_K_DIVISOR + MX_SCALE_BLOCK_K: tl.constexpr = BLOCK_K // MX_PACK_DIVISOR + + WMxScale += expt_id * stride_w_mx_e + + if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE": + # TODO: support non W_TRANSPOSE with blackwell swizzling + tl.static_assert(W_TRANSPOSE) + tl.static_assert(BLOCK_N % 128 == 0) + tl.static_assert(MX_SCALE_BLOCK_K % 4 == 0) + PACKED_MX_BLOCK: tl.constexpr = (MX_SCALE_BLOCK_K // 4) * 32 * 4 * 4 + SCALE_BLOCK_N: tl.constexpr = BLOCK_N // 128 + stride_scale_k: tl.constexpr = 1 + elif SWIZZLE_MX_SCALE == "HOPPER_SCALE": + # TODO: support non W_TRANSPOSE with Hopper swizzling + tl.static_assert(W_TRANSPOSE) + n_warps: tl.constexpr = tl.extra.cuda.num_warps() + tl.static_assert(BLOCK_N % (2 * n_warps * 2 * 8) == 0) + tl.static_assert(MX_SCALE_BLOCK_K % 2 == 0) + PACKED_MX_BLOCK: tl.constexpr = MX_SCALE_BLOCK_K * 32 + SCALE_BLOCK_N: tl.constexpr = BLOCK_N // 32 + stride_scale_k = stride_w_mx_k + elif SWIZZLE_MX_SCALE == "CDNA4_SCALE": + tl.static_assert(stride_w_mx_k is not None) + tl.static_assert(stride_w_mx_n is not None) + NON_K_PRESHUFFLE_BLOCK_SIZE: tl.constexpr = 32 + PACKED_MX_BLOCK: tl.constexpr = MX_SCALE_BLOCK_K * NON_K_PRESHUFFLE_BLOCK_SIZE + SCALE_BLOCK_N: tl.constexpr = BLOCK_N // NON_K_PRESHUFFLE_BLOCK_SIZE + stride_scale_k = stride_w_mx_k + else: + PACKED_MX_BLOCK: tl.constexpr = MX_SCALE_BLOCK_K + SCALE_BLOCK_N: tl.constexpr = BLOCK_N + stride_scale_k = stride_w_mx_k + offs_n_scale = (pid_n * SCALE_BLOCK_N + tl.arange(0, SCALE_BLOCK_N)) % N + offs_n_scale = tl.max_contiguous(tl.multiple_of(offs_n_scale, SCALE_BLOCK_N), SCALE_BLOCK_N) + # K dimension must be the last dimension for the scales + offs_k_scale = PACKED_MX_BLOCK * pid_k + tl.arange(0, PACKED_MX_BLOCK) + WMxScalePtrs = WMxScale + offs_k_scale.to(index_type)[None, :] * stride_scale_k + offs_n_scale.to(index_type)[:, None] * stride_w_mx_n + else: + WMxScalePtrs = None + offs_k_scale = None + W_K_DIVISOR: tl.constexpr = 1 + W_K_MULTIPLIER: tl.constexpr = 1 + W_N_DIVISOR: tl.constexpr = 1 + PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K + PACKED_BLOCK_N_W: tl.constexpr = BLOCK_N + + # B pointers + offs_w_n = pid_n * PACKED_BLOCK_N_W + tl.arange(0, PACKED_BLOCK_N_W) + offs_w_n = tl.max_contiguous(tl.multiple_of(offs_w_n % (N // W_N_DIVISOR), PACKED_BLOCK_N_W), PACKED_BLOCK_N_W) + + if is_x_microscaled: + XMxScale += start_z.to(index_type) * stride_x_mx_z + if GatherIndx is None: + XMxScale += start_m * stride_x_mx_m + offs_x_k_scale = MX_SCALE_BLOCK_K * pid_k + tl.arange(0, MX_SCALE_BLOCK_K) + XMxScalePtrs = XMxScale + offs_x_m.to(index_type)[:, None] * stride_x_mx_m + offs_x_k_scale.to(index_type)[None, :] * stride_x_mx_k + else: + XMxScalePtrs = None + + offs_w_k = PACKED_BLOCK_K_W * pid_k + tl.arange(0, PACKED_BLOCK_K_W) + W += expt_id * stride_w_e + WPtrs = W + (offs_w_k.to(index_type)[:, None] * stride_w_k + offs_w_n.to(index_type)[None, :] * stride_w_n) + # compute output + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(K, BLOCK_K * pid_k, -(BLOCK_K * SPLIT_K)): + if EVEN_K: + mask_k = tl.full([BLOCK_K], True, dtype=tl.int1) + mask_k_w = tl.full([PACKED_BLOCK_K_W], True, dtype=tl.int1) + if is_w_microscaled and SWIZZLE_MX_SCALE is None: + mask_k_scale = tl.full([PACKED_MX_BLOCK], True, dtype=tl.int1) + if is_x_microscaled: + mask_x_k_scale = tl.full([MX_SCALE_BLOCK_K], True, dtype=tl.int1) + else: + mask_k = offs_k < k + mask_k_w = offs_w_k < ((k // (W_K_DIVISOR if W_TRANSPOSE else 1)) * W_K_MULTIPLIER) + if is_w_microscaled and SWIZZLE_MX_SCALE is None: + mask_k_scale = offs_k_scale * MX_PACK_DIVISOR < k + if is_x_microscaled: + mask_x_k_scale = offs_x_k_scale * MX_PACK_DIVISOR < k + + x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0) + w = tl.load(WPtrs, mask=mask_k_w[:, None], other=0.0, cache_modifier=W_CACHE_MODIFIER) + if is_w_microscaled: + x_format: tl.constexpr = get_scaled_dot_format_string(x.dtype) + w_format: tl.constexpr = get_scaled_dot_format_string(w.dtype) + + if is_x_microscaled: + x_scales = tl.load(XMxScalePtrs, mask=mask_x_k_scale[None, :]) + elif x_format == "fp16" or x_format == "bf16": + x_scales: tl.constexpr = None + else: + # Scale of 1 in E8M0 format + x_scales = tl.full((BLOCK_M, MX_SCALE_BLOCK_K), 127, dtype=tl.uint8) + + if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE": + w_scales = unswizzle_mx_scale_bw(tl.load(WMxScalePtrs)) + elif SWIZZLE_MX_SCALE == "HOPPER_SCALE": + # Handshake with the swizzling code + num_warps: tl.constexpr = tl.extra.cuda.num_warps() + w_scales = unswizzle_mxfp4_scale_hopper(tl.load(WMxScalePtrs), mx_axis=1, num_warps=num_warps) + elif SWIZZLE_MX_SCALE == "CDNA4_SCALE": + w_scales = unswizzle_mx_scale_cdna4(tl.load(WMxScalePtrs), BLOCK_N, MX_SCALE_BLOCK_K) + else: + w_scales = tl.load(WMxScalePtrs, mask=mask_k_scale[None, :]) + + if SWIZZLE_MX_VALUE == "HOPPER_VALUE": + # Handshake with the swizzling code + tl.static_assert(x_format == "bf16") + tl.static_assert(w_format == "e2m1") + w = mxfp4_to_bf16_triton(w.trans(), w_scales, 1) + tl.static_assert(w.dtype == tl.bfloat16) + acc = acc.trans() + x = x.trans() + # w = w.trans() + acc = tl.dot(w, x, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32) + acc = acc.trans() + else: + rhs_k_pack: tl.constexpr = W_TRANSPOSE or not is_w_microscaled or W_K_DIVISOR != 2 + acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, w_format, acc=acc, fast_math=True, rhs_k_pack=rhs_k_pack) + if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE": + WMxScalePtrs += (MX_SCALE_BLOCK_K // 4 * SPLIT_K) * stride_w_mx_k + else: + WMxScalePtrs += (PACKED_MX_BLOCK * SPLIT_K) * stride_w_mx_k + if is_x_microscaled: + XMxScalePtrs += (MX_SCALE_BLOCK_K * SPLIT_K) * stride_x_mx_k + else: + acc = tl.dot(x, w, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32) + XPtrs += (BLOCK_K * SPLIT_K) * stride_x_k + WPtrs += (PACKED_BLOCK_K_W * SPLIT_K) * stride_w_k + # bias + scale + offs_m = BLOCK_M * block_id + tl.arange(0, BLOCK_M) + offs_y_n = BLOCK_N * pid_n + tl.arange(0, BLOCK_N) + mask_m = offs_m < M + mask_n = offs_y_n < N + if B is not None: + BPtrs = B + expt_id * stride_b_e + offs_y_n + if pid_k == 0: + bias = tl.load(BPtrs, mask=mask_n, other=0) + else: + bias = tl.full([BLOCK_N], 0, dtype=tl.float32) + else: + bias = tl.full([BLOCK_N], 0, dtype=tl.float32) + if Betas is not None: + betas = tl.load(Betas + start_m + offs_m, mask=mask_m, other=0.0) + else: + betas = tl.full([BLOCK_M], 1, dtype=tl.float32) + if Gammas is not None: + gammas = tl.load(Gammas + start_m + offs_m, mask=mask_m, other=0.0) + else: + gammas = tl.full([BLOCK_M], 1, dtype=tl.float32) + # flexpoint + x_scale = load_scale(XScale) + if PER_BATCH_SCALE: + w_scale = load_scale(WScale + expt_id) + else: + w_scale = load_scale(WScale) + acc *= x_scale * w_scale + acc = acc + bias[None, :] * betas[:, None] + if out_alpha is not None: + acc *= out_alpha + if ACTIVATION_FN is not None: + out = ACTIVATION_FN(acc, *activation_fn_args) + tl.static_assert(out.shape[1] == OUT_BLOCK_N, f"Activation fn out.shape[1] ({out.shape[1]}) doesn't match computed OUT_BLOCK_N ({OUT_BLOCK_N})") + offs_y_n = OUT_BLOCK_N * pid_n + tl.arange(0, OUT_BLOCK_N) + mask_n = offs_y_n < yN + else: + tl.static_assert(ACTIVATION_REDUCTION_N == 1, "Activation reduction must be 1 if no activation fn is provided") + out = acc + out *= gammas[:, None] + # write-back + Y += start_z.to(index_type) * stride_y_z + if WriteBackIndx is not None: + WriteBackIndx += start_m + dst_idx = tl.load(WriteBackIndx + offs_m, mask=start_m + offs_m < writeback_size, other=-1) + mask_m = mask_m & (dst_idx != -1) + offs_y_m = dst_idx + else: + Y += start_m * stride_y_m + offs_y_m = offs_m + + YPtrs = Y + offs_y_m.to(index_type)[:, None] * stride_y_m + offs_y_n.to(index_type)[None, :] * stride_y_n + mask = mask_m[:, None] & mask_n[None, :] + if is_out_microscaled: + MX_SCALE_BLOCK_N: tl.constexpr = BLOCK_N // MXFP_BLOCK_SIZE + N_MX_BLOCK: tl.constexpr = tl.cdiv(N, MXFP_BLOCK_SIZE) + tl.static_assert(EPILOGUE_FN is not None) + out, out_scale = EPILOGUE_FN(out, mask, *epilogue_fn_args) + tl.static_assert(BLOCK_N % MX_SCALE_BLOCK_N == 0, "") + offs_y_n_scale = MX_SCALE_BLOCK_N * pid_n + tl.arange(0, MX_SCALE_BLOCK_N) + mask_n_scale = offs_y_n_scale < N_MX_BLOCK + YActualScale += start_z.to(index_type) * stride_y_mx_z + if WriteBackIndx is None: + YActualScale += start_m * stride_y_mx_m + YActualScalePtrs = YActualScale + offs_y_m.to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n + else: + YActualScalePtrs = YActualScale + (offs_y_m - NRows).to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n + tl.store(YActualScalePtrs, out_scale, mask=mask_m[:, None] & mask_n_scale[None, :]) + else: + out = float_to_flex(out, YExpectedScale, YActualScale, YChecksumScale, mask, Y, FLEXPOINT_SATURATE_INF) + if EPILOGUE_FN is not None and not IS_EPILOGUE_QUANT_MXFP8: + out = EPILOGUE_FN(out, *epilogue_fn_args, target_dtype=YPtrs.dtype.element_ty) + tl.store(YPtrs, out, mask=mask) diff --git a/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py b/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py new file mode 100644 index 0000000000..ac5ae147fc --- /dev/null +++ b/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py @@ -0,0 +1,475 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py +# Triton is licensed under the MIT License. + +# isort: off +# fmt: off +import torch +import triton +import triton.language as tl +from triton.tools.ragged_tma import load_ragged, store_ragged +from triton_kernels import target_info +from triton_kernels.tensor_details.layout_details.blackwell_scale import unswizzle_mx_scale_bw +from triton_kernels.numerics_details.flexpoint import ( + float_to_flex, + load_scale, + nan_propagating_absmax_reduce, + compute_scale, +) +from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE +from ._common import make_matmul_repr, matmul_launch_metadata, swizzle2d, xcd_swizzle, get_scaled_dot_format_string + + +@triton.constexpr_function +def cuda_capability_geq(major, minor): + return target_info.cuda_capability_geq(major, minor) + +@triton.constexpr_function +def get_dtype(tensor_or_desc: tl.tensor | tl.tensor_descriptor) -> tl.dtype: + if isinstance(tensor_or_desc, tl.tensor): + return tensor_or_desc.dtype.element_ty + elif isinstance(tensor_or_desc, tl.tensor_descriptor): + return tensor_or_desc.dtype + else: + raise ValueError(f"Invalid type: {type(tensor_or_desc)}") + +@triton.jit +def _load_tile_attrs( + tile_id, num_tiles, grid_m, grid_n, padding_m, + M, ExptData, ExptHist, ExptOffs, + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, SPLIT_K: tl.constexpr, + GROUP_M: tl.constexpr, XCD_SWIZZLE: tl.constexpr): + # unpack and swizzle program ids + pid_emnk = tile_id + if XCD_SWIZZLE != 1: + pid_emnk = xcd_swizzle(pid_emnk, num_tiles // SPLIT_K, XCD_SWIZZLE) + pid_e = pid_emnk // ((grid_m - padding_m) * grid_n * SPLIT_K) + pid_mnk = pid_emnk % ((grid_m - padding_m) * grid_n * SPLIT_K) + if SPLIT_K > 1: + pid_k = pid_mnk % SPLIT_K + pid_mn = pid_mnk // SPLIT_K + else: + pid_k: tl.constexpr = 0 + pid_mn = pid_mnk + pid_m, pid_n = swizzle2d(pid_mn, (grid_m - padding_m), grid_n, GROUP_M) + + # unpack expert data + if ExptData is None: + tl.static_assert(M is not None) + expt_id, start_z, start_m, block_id, eM = pid_e, pid_e, 0, pid_m, -1 + else: + tl.static_assert(M is None) + expt_data = tl.load(ExptData + pid_m) + expt_id = expt_data & 0x0000FFFF + block_id = expt_data >> 16 + eM = tl.load(ExptHist + expt_id) + start_m = tl.load(ExptOffs + expt_id) + start_z = 0 + + off_m = BLOCK_M * block_id + off_n = BLOCK_N * pid_n + + return expt_id, start_z, start_m, eM, off_m, off_n, pid_k + +@triton.jit +def _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, offs, mask): + mask = mask & (offs < writeback_size) + offs = tl.load(WriteBackIndx + offs, mask=mask, other=-1) + mask = offs != -1 + return (offs, mask) + + +_matmul_ogs_repr = make_matmul_repr("_p_matmul_ogs", [0, 1, 2]) +@triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"], + repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata) +def _p_matmul_ogs( + Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n, + YExpectedScale, YActualScale, YChecksumScale, + stride_y_mx_z, stride_y_mx_m, stride_y_mx_n, + X, XPtr, stride_x_z, stride_x_m, stride_x_k, + XScale, + XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k, + W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr, + WScale, + MxScale, stride_mx_e, stride_mx_k, stride_mx_n, + B, stride_b_e, # Bias + NRows, M, N, K, # shapes + # expt data + Betas, Gammas, + GatherIndx, + ScatterSrcIndx, num_idxs, + WriteBackIndx, writeback_size, + ExptHist, ExptOffs, ExptOffsSum, ExptData, + # true grid size + batch_size, grid_m, grid_n, + # Out scale + out_alpha, + # fused activation function + ACTIVATION_FN: tl.constexpr, activation_fn_args, ACTIVATION_REDUCTION_N: tl.constexpr, + # epilogue transform + EPILOGUE_FN: tl.constexpr, epilogue_fn_args, + # MoE config + N_EXPTS_TOT: tl.constexpr, N_EXPTS_ACT: tl.constexpr, + # precision config + MAX_NUM_IMPRECISE_ACC: tl.constexpr, ALLOW_TF32: tl.constexpr, + FLEXPOINT_SATURATE_INF: tl.constexpr, + PER_BATCH_SCALE: tl.constexpr, + # optimization config + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, XCD_SWIZZLE: tl.constexpr, + # NYI: Must be None + SWIZZLE_MX_VALUE: tl.constexpr, + # One of ["BLACKWELL", None] + SWIZZLE_MX_SCALE: tl.constexpr, + EPILOGUE_SUBTILE: tl.constexpr, + EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, + W_CACHE_MODIFIER: tl.constexpr, + NUM_SMS: tl.constexpr, + X_TMA_MODE: tl.constexpr, + Y_TMA_MODE: tl.constexpr, + TOKENS_PER_EXPT_FOR_ANNOTATION=None, + UPCAST_INDICES:tl.constexpr=False, + SWAP_XW: tl.constexpr = False, + IS_EPILOGUE_QUANT_MXFP8: tl.constexpr = False): + # tl.static_assert(SWIZZLE_MX_VALUE is None, "NYI. Value swizzling") + + # why is this faster than using host-side tensor descriptor?! + if Y_TMA_MODE is not None: + Y = tl.make_tensor_descriptor(YPtr, Y.shape, Y.strides[:-1] + (1,), Y.block_shape) + + is_microscaled_format: tl.constexpr = MxScale is not None + tl.static_assert(not is_microscaled_format or W_TRANSPOSE, "NYI. Non-transposed mxfp4 weights") + MX_PACK_DIVISOR: tl.constexpr = MXFP_BLOCK_SIZE + if is_microscaled_format: + w_type: tl.constexpr = get_dtype(W) + tl.static_assert(w_type == tl.uint8 or (w_type == tl.float8e4nv or w_type == tl.float8e5), + "mx_weight_ptr must be uint8") + tl.static_assert(get_dtype(MxScale) == tl.uint8, "mx_scale_ptr must be uint8") + tl.static_assert(BLOCK_K % MX_PACK_DIVISOR == 0, "BLOCK_K must be a multiple of MX_PACK_DIVISOR") + tl.static_assert(SWIZZLE_MX_SCALE == "BLACKWELL_SCALE" or SWIZZLE_MX_SCALE is None, "Only Blackwell swizzling is supported for scales") + + # We have pack 2 fp4 values in a byte + W_PACK_DIVISOR: tl.constexpr = 2 if w_type == tl.uint8 else 1 + PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K // W_PACK_DIVISOR + MX_SCALE_BLOCK_K: tl.constexpr = BLOCK_K // MX_PACK_DIVISOR + else: + W_PACK_DIVISOR: tl.constexpr = 1 + MX_SCALE_BLOCK_K: tl.constexpr = 1 + PACKED_BLOCK_K_W: tl.constexpr = BLOCK_K + tl.static_assert(SWIZZLE_MX_SCALE is None) + + if ExptOffsSum is not None: + # Determine how much padding there is on the expert data. This allows us to + # know the true grid size and avoid processing padding tiles. + padding_m = grid_m - tl.load(ExptOffsSum) + else: + padding_m: tl.constexpr = 0 + + index_type: tl.constexpr = tl.int64 + + USE_FLEXPOINT_SCALE: tl.constexpr = YActualScale is not None or YChecksumScale is not None + HAS_SCATTER: tl.constexpr = WriteBackIndx is not None + HAS_GATHER: tl.constexpr = GatherIndx is not None + USE_GATHER_TMA: tl.constexpr = HAS_GATHER and X_TMA_MODE == "dense" + USE_SCATTER_TMA: tl.constexpr = HAS_SCATTER and Y_TMA_MODE == "dense" + + if EPILOGUE_SUBTILE is None: + SUBTILE_FACTOR: tl.constexpr = 1 + else: + SUBTILE_FACTOR: tl.constexpr = EPILOGUE_SUBTILE + EPILOGUE_BLOCK_N: tl.constexpr = BLOCK_N // SUBTILE_FACTOR + OUT_BLOCK_N: tl.constexpr = EPILOGUE_BLOCK_N // ACTIVATION_REDUCTION_N + yN = N // ACTIVATION_REDUCTION_N + + # set masked out rows to 0 + if HAS_SCATTER and N_EXPTS_ACT == 1: + # Iterate with reversed pids so that later pids will get more tiles if the number of + # tiles isn't evenly divisible by the number of SMs. + # The main loop after this iterates in the forward direction such that earlier + # pids get more tiles if the number of tiles isn't evenly divisible. + # This helps balance the work across the SMs. + for pid_mnk in range(NUM_SMS - tl.program_id(0) - 1, batch_size * grid_m * grid_n * SPLIT_K, NUM_SMS): + pid_k = pid_mnk % SPLIT_K + pid_mn = pid_mnk // SPLIT_K + pid_m, pid_n = swizzle2d(pid_mn, grid_m, grid_n, GROUP_M) + + z = tl.zeros([BLOCK_M, BLOCK_N // ACTIVATION_REDUCTION_N], dtype=tl.float32) + offs_m = z.shape[0] * pid_m + tl.arange(0, z.shape[0]) + offs_n = z.shape[1] * pid_n + tl.arange(0, z.shape[1]) + src_idx = tl.load(ScatterSrcIndx + offs_m, mask=offs_m < num_idxs, other=0) + YPtrs = YPtr + offs_m.to(index_type)[:, None] * stride_y_m + offs_n[None, :] * stride_y_n + mask_n = offs_n < yN + mask = (src_idx == -1)[:, None] & mask_n[None, :] + tl.store(YPtrs + pid_k * stride_y_k, z, mask=mask) + + + k_tiles = tl.cdiv(K, BLOCK_K * SPLIT_K) + num_tiles = batch_size * (grid_m - padding_m) * grid_n * SPLIT_K + + # If true, do not share loop-carried variables between the prologue and the + # epilogue to enable better pipelining with mmav5 + INDEPENDENT_EPILOGUE: tl.constexpr = cuda_capability_geq(10, 0) + + # start negative; will be incremented at the top of the loop + if INDEPENDENT_EPILOGUE: + tile_id1 = tl.program_id(0) - NUM_SMS + + # Keep track of local max for updating flexpoint scales. + THREADS_PER_BLOCK: tl.constexpr = tl.extra.cuda.num_threads() + local_absmax = tl.full([THREADS_PER_BLOCK], 0.0, tl.uint32) + + DISALLOW_ACC_MULTI_BUFFER: tl.constexpr = is_microscaled_format and BLOCK_M * BLOCK_N >= 128 * 256 + + for tile_id in tl.range(tl.program_id(0), num_tiles, NUM_SMS, flatten=True, disallow_acc_multi_buffer=DISALLOW_ACC_MULTI_BUFFER, warp_specialize=True): + expt_id, start_z, start_m, eM, off_m, off_n, pid_k = _load_tile_attrs( + tile_id, num_tiles, grid_m, grid_n, padding_m, + M, ExptData, ExptHist, ExptOffs, + BLOCK_M, BLOCK_N, SPLIT_K, + GROUP_M, XCD_SWIZZLE) + + # Base pointers and offsets. + if X_TMA_MODE is None: + XBase = X + start_z.to(index_type) * stride_x_z + offs_x_k = tl.arange(0, BLOCK_K)[None, :] * stride_x_k + if SPLIT_K > 1: + offs_x_k += pid_k.to(index_type) * BLOCK_K * stride_x_k + + if USE_GATHER_TMA: + offs_m = off_m + tl.arange(0, BLOCK_M) + mask_m = offs_m < (M if M is not None else eM) + if ExptData is None: + offs_x_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m, mask=mask_m) + # Bump rows to account for the Z offset. + offs_x_m += start_z * (stride_x_z // stride_x_m) + offs_x_m = tl.where(mask_m, offs_x_m, -1) + else: + offs_x_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m, + mask=mask_m, other=-N_EXPTS_ACT) // N_EXPTS_ACT + elif X_TMA_MODE is None: + tl.static_assert(HAS_GATHER) + offs_m = off_m + tl.arange(0, BLOCK_M) + if M is not None: + offs_m = tl.max_contiguous(tl.multiple_of(offs_m % M, BLOCK_M), BLOCK_M) + else: + offs_m = tl.max_contiguous(tl.multiple_of(offs_m % eM, BLOCK_M), BLOCK_M) + # no needs to bounds-check here because `offs_m` wraps around M dim + offs_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m) // N_EXPTS_ACT + offs_x_m = offs_m.to(index_type)[:, None] * stride_x_m + + + acc = tl.zeros((BLOCK_N, BLOCK_M) if SWAP_XW else (BLOCK_M, BLOCK_N), dtype=tl.float32) + for ki in tl.range(k_tiles, disallow_acc_multi_buffer=DISALLOW_ACC_MULTI_BUFFER): + off_k = pid_k * BLOCK_K + ki * BLOCK_K * SPLIT_K + off_k_w = pid_k * PACKED_BLOCK_K_W + ki * PACKED_BLOCK_K_W * SPLIT_K + off_k_mx = pid_k * MX_SCALE_BLOCK_K + ki * MX_SCALE_BLOCK_K * SPLIT_K + + # --- load x --- + if USE_GATHER_TMA: + x = X.gather(offs_x_m, off_k) + elif X_TMA_MODE == "dense": + x = X.load([start_z, start_m + off_m, off_k]) + x = x.reshape(BLOCK_M, BLOCK_K) + elif X_TMA_MODE == "ragged": + x = load_ragged(X, start_m, eM, [start_z, off_m, off_k], ragged_dim=1) + x = x.reshape(BLOCK_M, BLOCK_K) + else: + tl.static_assert(X_TMA_MODE is None) + XPtrs = XBase + offs_x_m + offs_x_k + XBase += BLOCK_K * SPLIT_K * stride_x_k + mask_k = tl.arange(0, BLOCK_K) < K - off_k + if EVEN_K: + if SPLIT_K > 1: + x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0) + else: + x = tl.load(XPtrs) + else: + x = tl.load(XPtrs, mask=mask_k[None, :], other=0.0) + + # --- load w --- + if W_TRANSPOSE: + w = tl.reshape(W.load([expt_id, off_n, off_k_w]), W.block_shape[1:]).T + else: + w = tl.reshape(W.load([expt_id, off_k_w, off_n]), W.block_shape[1:]) + + # --- load w_scale --- + if is_microscaled_format: + x_format: tl.constexpr = get_scaled_dot_format_string(x.dtype) + mx_format: tl.constexpr = get_scaled_dot_format_string(w.dtype) + if x_format == "fp16" or x_format == "bf16": + x_scales: tl.constexpr = None + else: + x_scales = tl.full((BLOCK_M, BLOCK_K // MX_PACK_DIVISOR), 127, dtype=tl.uint8) + if SWIZZLE_MX_SCALE == "BLACKWELL_SCALE": + flattened_expt_n_idx = expt_id * ((N + 127) // 128) + (off_n // 128) + w_scales = MxScale.load([0, flattened_expt_n_idx, pid_k * MX_SCALE_BLOCK_K // 4 + ki * (MX_SCALE_BLOCK_K // 4 * SPLIT_K), 0, 0]) + w_scales = w_scales.reshape((w_scales.shape[1], w_scales.shape[2] * w_scales.shape[-2] * w_scales.shape[-1])) + w_scales = unswizzle_mx_scale_bw(w_scales) + else: + w_scales = MxScale.load([expt_id, off_k_mx, off_n]) + w_scales = tl.reshape(w_scales, *w_scales.shape[1:]).T + + # --- update accumulator --- + if is_microscaled_format: + if SWAP_XW: + acc = tl.dot_scaled(w.T, w_scales, mx_format, x.T, x_scales, x_format, acc=acc, fast_math=True) + else: + acc = tl.dot_scaled(x, x_scales, x_format, w, w_scales, mx_format, acc=acc, fast_math=True) + else: + if SWAP_XW: + acc = tl.dot(w.T, x.T, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32) + else: + acc = tl.dot(x, w, acc, max_num_imprecise_acc=MAX_NUM_IMPRECISE_ACC, allow_tf32=ALLOW_TF32) + + if INDEPENDENT_EPILOGUE: + tile_id1 += NUM_SMS + expt_id1, start_z1, start_m1, eM1, off_m1, off_n1, pid_k1 = _load_tile_attrs( + tile_id1, num_tiles, grid_m, grid_n, padding_m, + M, ExptData, ExptHist, ExptOffs, + BLOCK_M, BLOCK_N, SPLIT_K, + GROUP_M, XCD_SWIZZLE) + else: + tile_id1, expt_id1, start_z1, start_m1, eM1 = tile_id, expt_id, start_z, start_m, eM + off_m1, off_n1, pid_k1 = off_m, off_n, pid_k + + offs_m = off_m1 + tl.arange(0, BLOCK_M) + mask_m = offs_m < (M if M is not None else eM1) + if USE_SCATTER_TMA: + offs_y_m, mask_m = _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, start_m1 + offs_m, mask_m) + MASK_ACC: tl.constexpr = USE_FLEXPOINT_SCALE + if SPLIT_K > 1: + # Compute the split k offset in number of rows, and add it to offs_y_m. + # This allows us to write to the correct slice in the output tensor while using + # a 2D TMA scatter. + tl.device_assert(stride_y_k // stride_y_m == tl.cdiv(stride_y_k, stride_y_m)) + split_k_row_offs = pid_k1 * (stride_y_k // stride_y_m) + offs_y_m = tl.where(mask_m, offs_y_m + split_k_row_offs, offs_y_m) + elif Y_TMA_MODE is None: + tl.static_assert(HAS_SCATTER) + offs_y_m, mask_m = _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, start_m1 + offs_m, mask_m) + MASK_ACC: tl.constexpr = USE_FLEXPOINT_SCALE + else: + offs_y_m = start_m1 + offs_m + MASK_ACC = False if USE_GATHER_TMA else USE_FLEXPOINT_SCALE + + # bias + scale + offs_y_n = off_n1 + tl.arange(0, BLOCK_N) + mask_n = offs_y_n < N + if B is not None: + BPtrs = B + expt_id1 * stride_b_e + offs_y_n + if pid_k1 == 0: + bias = tl.load(BPtrs, mask=mask_n, other=0) + else: + bias = tl.full([BLOCK_N], 0, dtype=tl.float32) + else: + bias = tl.full([BLOCK_N], 0, dtype=tl.float32) + if Betas is not None: + betas = tl.load(Betas + start_m1 + offs_m, mask=mask_m, other=0.0) + else: + betas = tl.full([BLOCK_M], 1, dtype=tl.float32) + if Gammas is not None: + gammas = tl.load(Gammas + start_m1 + offs_m, mask=mask_m, other=0.0) + else: + gammas = tl.full([BLOCK_M], 1, dtype=tl.float32) + x_scale = load_scale(XScale) + if PER_BATCH_SCALE: + w_scale = load_scale(WScale + expt_id1) + else: + w_scale = load_scale(WScale) + + accs = (acc,) + biases = (bias,) + + if SUBTILE_FACTOR >= 2: + acc0, acc1 = acc.reshape(BLOCK_M, 2, BLOCK_N // 2).permute(0, 2, 1).split() + accs = (acc0, acc1) + bias0, bias1 = bias.reshape(2, BLOCK_N // 2).permute(1, 0).split() + biases = (bias0, bias1) + + if SUBTILE_FACTOR >= 4: + acc00, acc01 = acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split() + acc10, acc11 = acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split() + accs = (acc00, acc01, acc10, acc11) + bias00, bias01 = bias0.reshape(2, BLOCK_N // 4).permute(1, 0).split() + bias10, bias11 = bias1.reshape(2, BLOCK_N // 4).permute(1, 0).split() + biases = (bias00, bias01, bias10, bias11) + + tl.static_assert(EPILOGUE_BLOCK_N == BLOCK_N // SUBTILE_FACTOR) + tl.static_assert(len(accs) == SUBTILE_FACTOR) + + for a_i in tl.static_range(len(accs)): + acc_tile = accs[a_i] + acc_tile *= x_scale * w_scale + + if SWAP_XW: + acc_tile = acc_tile.T + + acc_tile = acc_tile + biases[a_i][None, :] * betas[:, None] + if out_alpha is not None: + acc_tile *= out_alpha + + if ACTIVATION_FN is not None: + out = ACTIVATION_FN(acc_tile, *activation_fn_args) + tl.static_assert(out.shape[1] == OUT_BLOCK_N, f"Activation fn out.shape[1] ({out.shape[1]}) doesn't match computed OUT_BLOCK_N ({OUT_BLOCK_N})") + else: + tl.static_assert(ACTIVATION_REDUCTION_N == 1, "Activation reduction must be 1 if no activation fn is provided") + out = acc_tile + + out *= gammas[:, None] + + if MASK_ACC: + out = tl.where(mask_m[:, None], out, 0.0) + # Flexpoint + out_view = tl.reshape(out, [out.numel // THREADS_PER_BLOCK, THREADS_PER_BLOCK], can_reorder=True) + local_absmax = tl.maximum(local_absmax, nan_propagating_absmax_reduce(out_view, axis=0)) + out = float_to_flex( + out, YExpectedScale, + None, # ActualScale: local absmax is tracked and updated after the loop + YChecksumScale, + None, # mask: out is manually masked to 0 + YPtr, FLEXPOINT_SATURATE_INF + ) + if EPILOGUE_FN is not None: + out = EPILOGUE_FN(out, *epilogue_fn_args, target_dtype=YPtr.dtype.element_ty, pid=len(accs)*tile_id1 + a_i) + + out_off_n = off_n1 // ACTIVATION_REDUCTION_N + a_i * OUT_BLOCK_N + out = out.to(YPtr.dtype.element_ty) + if USE_SCATTER_TMA: + # Convert -1 offsets to INT_MAX. We do this by clearing the leading bit. Note that + # there shouldn't be any other negative values. + offs_y_m = (offs_y_m.to(tl.uint32, bitcast=True) & 0x7FFFFFFF).to(tl.int32, bitcast=True) + Y.scatter(out, offs_y_m, out_off_n) + elif Y_TMA_MODE == "dense": + out = tl.reshape(out, [1] + out.shape) + off_kz = pid_k * batch_size + start_z1 + Y.store([off_kz, off_m1, out_off_n], out) + elif Y_TMA_MODE == "ragged": + out = tl.reshape(out, [1] + out.shape) + store_ragged(Y, start_m1, eM1, [pid_k, off_m1, out_off_n], out, ragged_dim=1) + else: + tl.static_assert(Y_TMA_MODE is None) + offs_y_n = out_off_n + tl.arange(0, OUT_BLOCK_N) + mask_n = offs_y_n < yN + + YPtrs = YPtr + pid_k1.to(index_type) * stride_y_k + start_z1.to(index_type) * stride_y_z + offs_y_m.to(index_type)[:, None] * stride_y_m + offs_y_n[None, :] * stride_y_n + mask = mask_m[:, None] & mask_n[None, :] + tl.store(YPtrs, out, mask=mask) + + + # Update the flexpoint scales + if YActualScale is not None: + tl.atomic_max(YActualScale, compute_scale(local_absmax.to(tl.float32, bitcast=True), YPtr), sem="relaxed") + + +_per_device_alloc_fns = {} +def get_per_device_per_stream_alloc_fn(device): + if device not in _per_device_alloc_fns: + _per_stream_tensors = {} + def alloc_fn(size: int, alignment: int, stream): + assert alignment == 128 + if stream not in _per_stream_tensors or _per_stream_tensors[stream].numel() < size: + _per_stream_tensors[stream] = torch.empty(size, device=device, dtype=torch.int8) + _per_stream_tensors[stream].__hibernate__ = {"type": "ignore"} + return _per_stream_tensors[stream] + + _per_device_alloc_fns[device] = alloc_fn + return _per_device_alloc_fns[device] diff --git a/triton_kernels/matmul_ogs_details/_reduce_grouped.py b/triton_kernels/matmul_ogs_details/_reduce_grouped.py new file mode 100644 index 0000000000..7fe29bd40a --- /dev/null +++ b/triton_kernels/matmul_ogs_details/_reduce_grouped.py @@ -0,0 +1,98 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/_reduce_grouped.py +# Triton is licensed under the MIT License. + +from triton_kernels.numerics_details.flexpoint import float_to_flex, load_scale +from triton_kernels.numerics_details.mxfp import quantize_mxfp8_fn +import triton +import triton.language as tl + + +@triton.jit +def _reduce_grouped(X, stride_xb: tl.uint64, stride_xm: tl.uint64, stride_xn, # + XScale, # input scalar flex scale + Out, stride_om: tl.uint64, stride_on, # output tensor + OutExpectedScale, OutActualScale, OutChecksumScale, # output scalar flex scales + InIndx, B, N, # + XMxScale, stride_mxb: tl.uint64, + stride_mxs: tl.uint64, # optional per-32-col output MXFP scales (uint8) + OutMxScale, stride_omxs: tl.uint64, # optional per-32-col output MXFP scales (uint8) + # fused activation function + ACTIVATION_FN: tl.constexpr, activation_fn_args, ACTIVATION_REDUCTION_N: tl.constexpr, + # epilogue transform + EPILOGUE_FN: tl.constexpr, epilogue_fn_args, + # + HAS_IN_MX_SCALE: tl.constexpr, HAS_OUT_MX_SCALE: tl.constexpr, FLEXPOINT_SATURATE_INF: tl.constexpr, + K: tl.constexpr, BLOCK_N: tl.constexpr): + pid_t = tl.program_id(0) + BLOCK_N_OUT: tl.constexpr = BLOCK_N // ACTIVATION_REDUCTION_N + # persistent along N: single program on N, iterate tiles of size BLOCK_N + start = pid_t * K + # load indices into a tuple + if InIndx is None: + indxs = (pid_t, ) + else: + indxs = () + for i in tl.static_range(0, K): + indxs = indxs + (tl.load(InIndx + start + i), ) + # determine first valid topk row + fi = indxs[(K - 1)] + for i in tl.static_range(K - 2, -1, -1): + fi = tl.where(indxs[i] != -1, indxs[i], fi) + # record overwritten row index (may be -1 if none) + XPtrs = X + tl.arange(0, BLOCK_N) * stride_xn + OutPtrs = Out + tl.arange(0, BLOCK_N_OUT) * stride_on + if HAS_IN_MX_SCALE: + XScalePtrs = XMxScale + tl.arange(0, BLOCK_N // 32) * stride_xn + if HAS_OUT_MX_SCALE: + OutScalePtrs = OutMxScale + tl.arange(0, BLOCK_N_OUT // 32) * stride_on + x_scale = load_scale(XScale) + for n_curr in tl.range(0, N, BLOCK_N, num_stages=4): + acc = tl.zeros([BLOCK_N_OUT], dtype=tl.float32) + x_n_mask = tl.arange(0, BLOCK_N) < N - n_curr + x_n_mask_scale = tl.arange(0, BLOCK_N // 32) < tl.cdiv(N - n_curr, 32) + # accumulate contributions for this tile + for i in tl.static_range(0, K): + curr = tl.zeros([BLOCK_N], dtype=tl.float32) + # iterate over split_k partial values + for b in tl.range(0, B): + is_valid = indxs[i] != -1 + x_row_ptr = XPtrs + indxs[i] * stride_xm + b * stride_xb + vals = tl.load(x_row_ptr, mask=x_n_mask & is_valid, other=0.0) + vals = vals.to(tl.float32) + if HAS_IN_MX_SCALE: + scale_row_ptr = XScalePtrs + indxs[i] * stride_mxs + b * stride_mxb + scale = tl.load(scale_row_ptr, mask=x_n_mask_scale & is_valid, other=0.) + scale = (scale.to(tl.uint32) << 23).to(tl.float32, bitcast=True) + vals = vals.reshape([BLOCK_N // 32, 32]) + vals = (scale[:, None] * vals).reshape([BLOCK_N]) + curr += vals + # apply nonlinearity to split-k output + if ACTIVATION_FN is not None: + curr = ACTIVATION_FN(curr[None, :], *activation_fn_args) + curr = tl.reshape(curr, [curr.shape[-1]]) + # update final accumulator + acc += curr + acc *= x_scale + # Compute per-32-col MXFP scales for this tile if requested + Nrem = (N - n_curr) // ACTIVATION_REDUCTION_N + out_n_mask = tl.arange(0, BLOCK_N_OUT) < Nrem + out_n_mask_scale = tl.arange(0, BLOCK_N_OUT // 32) < tl.cdiv(Nrem, 32) + if HAS_OUT_MX_SCALE: + acc, acc_scale = quantize_mxfp8_fn(acc[None, :], out_n_mask[None, :]) + acc = tl.reshape(acc, [acc.shape[-1]]) + acc_scale = tl.reshape(acc_scale, [acc_scale.shape[-1]]) + # Convert to flexpoint output if configured (scalar scales) + acc = float_to_flex(acc, OutExpectedScale, OutActualScale, OutChecksumScale, None, Out, FLEXPOINT_SATURATE_INF) + # write-back for this tile + out_ptr = OutPtrs + pid_t * stride_om + tl.store(out_ptr, acc, mask=out_n_mask) + if HAS_OUT_MX_SCALE: + out_scale_ptr = OutScalePtrs + pid_t * stride_omxs + tl.store(out_scale_ptr, acc_scale, mask=out_n_mask_scale) + XPtrs += BLOCK_N * stride_xn + OutPtrs += BLOCK_N_OUT * stride_on + if HAS_IN_MX_SCALE: + XScalePtrs += BLOCK_N // 32 * stride_xn + if HAS_OUT_MX_SCALE: + OutScalePtrs += BLOCK_N_OUT // 32 * stride_xn diff --git a/triton_kernels/matmul_ogs_details/opt_flags.py b/triton_kernels/matmul_ogs_details/opt_flags.py new file mode 100644 index 0000000000..3d870d6427 --- /dev/null +++ b/triton_kernels/matmul_ogs_details/opt_flags.py @@ -0,0 +1,307 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py +# Triton is licensed under the MIT License. + +# isort: off +# fmt: off +from dataclasses import dataclass +import triton +from triton_kernels.target_info import get_cdna_version +import torch +from .opt_flags_details import opt_flags_amd, opt_flags_nvidia + + +@dataclass +class OptFlags: + block_m: int + block_n: int + block_k: int + num_warps: int + num_stages: int + group_m: int + xcd_swizzle: int + w_cache_modifier: str + split_k: int + is_persistent: bool + fused_scatter: bool + idle_sms: int + epilogue_subtile: int | None + arch: str + target_kernel_kwargs: dict + + def __post_init__(self): + if self.fused_scatter and self.split_k != 1: + raise ValueError("Not supported") + + +def make_default_opt_flags_amd( + out_dtype, + lhs_dtype, + rhs_dtype, + precision_config, + m, + n, + k, + routing_data, + can_use_persistent_tma, + can_use_fused_scatter, + enforce_bitwise_invariance, + epilogue_effective_itemsize, + constraints, +): + constraints_supported = ["block_m", "block_n", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile"] + assert not any([c not in constraints_supported for c in constraints]), constraints.keys() + # tokens per expert + if routing_data is None: + tokens_per_expt = m + elif routing_data.expected_tokens_per_expt is None: + tokens_per_expt = max(1, m // routing_data.n_expts_tot) + else: + tokens_per_expt = routing_data.expected_tokens_per_expt + + is_cdna4 = get_cdna_version() == 4 + # block_m + if constraints.get("block_m", None): + block_m = constraints["block_m"] + elif enforce_bitwise_invariance: + block_m = 256 if is_cdna4 else 128 + elif tokens_per_expt >= 512 and n >= 2048: + block_m = 256 if is_cdna4 else 128 + elif is_cdna4 and m >= 512: + block_m = 128 + else: + block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64)) + + if routing_data is not None: + grid_m = routing_data.n_blocks(m, block_m) + else: + grid_m = triton.cdiv(m, block_m) + # group_m: + group_m = 4 + # number of xcds + num_xcds = 8 + xcd_swizzle = num_xcds + # block_nk: + block_n, block_k = opt_flags_amd.compute_block_nk( + n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, precision_config + ) + # Replace block_k if provided in constraints. + # TODO: Does opt_flags_amd.compute_block_nk need to be refactored? + if constraints.get("block_k", None) is not None: + block_k = constraints["block_k"] + if constraints.get("block_n", None) is not None: + block_n = constraints["block_n"] + is_persistent = constraints.get("is_persistent", False) + # split_k: + if constraints.get("split_k", None) is not None: + split_k = constraints["split_k"] + elif is_persistent or enforce_bitwise_invariance: + split_k = 1 + else: + grid_size = grid_m * ((n + block_n - 1) // block_n) + n_cu = torch.cuda.get_device_properties(0).multi_processor_count + split_k = max(1, n_cu // grid_size) + # w_cache_modifier: + w_cache_modifier = ".cg" if block_m <= 32 else None + # num_warps, num_stages + num_warps = 2 if (m is not None and m <= 16) else 8 + num_stages = 2 + # AMD-specific + target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1} + epilogue_subtile = constraints.get('epilogue_subtile', None) + if epilogue_subtile is None: + epilogue_subtile = 1 + ret = OptFlags( + block_m=block_m, + block_n=block_n, + block_k=block_k, + num_warps=num_warps, + num_stages=num_stages, + group_m=group_m, + xcd_swizzle=xcd_swizzle, + w_cache_modifier=w_cache_modifier, + split_k=split_k, + is_persistent=is_persistent, + fused_scatter=constraints.get('fused_scatter', False), + idle_sms=0, + epilogue_subtile=epilogue_subtile, + arch=None, + target_kernel_kwargs=target_kernel_kwargs, + ) + # check constraints + assert all(getattr(ret, ck) == cv for ck, cv in constraints.items() if cv is not None), f"{ret} != {constraints}" + return ret + +def make_default_opt_flags_nvidia( + out_dtype, + lhs_dtype, + rhs_dtype, + precision_config, + m, + n, + k, + routing_data, + can_use_persistent_tma, + can_use_fused_scatter, + enforce_bitwise_invariance, + epilogue_effective_itemsize, + constraints, +): + constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages", "idle_sms"] + assert not any([c not in constraints_supported for c in constraints]), constraints.keys() + # tokens per expert + if routing_data is None: + tokens_per_expt = m + elif routing_data.expected_tokens_per_expt is None: + tokens_per_expt = max(1, m // routing_data.n_expts_tot) + else: + tokens_per_expt = routing_data.expected_tokens_per_expt + # pid swizzling + group_m = 8 + xcd_swizzle = 1 + # block_m + if constraints.get("block_m", None): + block_m = constraints["block_m"] + elif enforce_bitwise_invariance: + block_m = 128 + else: + block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128)) + # block n + arch = None + block_n = opt_flags_nvidia.compute_block_n(n, arch, precision_config) + # is_persistent + grid_size = opt_flags_nvidia.compute_grid_size(routing_data, m, n, block_m, block_n) + n_sms = torch.cuda.get_device_properties(0).multi_processor_count + tiles_per_sm = grid_size / n_sms + supports_persistent = can_use_persistent_tma and (arch is None or int(arch[2:-1]) >= 9) + if constraints.get("is_persistent", None) is not None: + is_persistent = constraints["is_persistent"] + else: + has_simple_epilogue = precision_config.max_num_imprecise_acc is None + is_persistent = supports_persistent and has_simple_epilogue and (tiles_per_sm >= 2.0 or lhs_dtype.itemsize <= 1) and out_dtype.itemsize < 4 + # TEMP CHANGE + if precision_config.act_scale is not None or precision_config.out_scale is not None: + is_persistent = False + # block k + if constraints.get("block_k", None) is not None: + block_k = constraints["block_k"] + else: + block_k = opt_flags_nvidia.compute_block_k(m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config) + # split_k + if constraints.get("split_k", None) is not None: + split_k = constraints["split_k"] + elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None: + split_k = 1 + else: + estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, m, n, block_m, block_n) + split_k = opt_flags_nvidia.compute_split_k(block_k, k, estimated_actual_grid_size) + if split_k > 1: + # With split_k, results are written in f32. Use that for the following computations. + out_dtype = torch.float32 + compute_num_stages_args = ( + precision_config, + is_persistent, + + block_m, + block_n, + block_k, + out_dtype, + lhs_dtype, + rhs_dtype, + ) + + if constraints.get("epilogue_subtile", None) is not None: + subtiles_to_check = [constraints["epilogue_subtile"]] + else: + subtiles_to_check = [1, 2, 4] + num_stages = -1 + for ep in subtiles_to_check: + ns = opt_flags_nvidia.compute_num_stages(*compute_num_stages_args, ep, epilogue_effective_itemsize) + if ns > num_stages: + epilogue_subtile, num_stages = ep, ns + assert num_stages >= 1 + if constraints.get("num_stages", None): + num_stages = constraints["num_stages"] + # fused scatter scratchpad + if constraints.get("fused_scatter", None) is not None: + fused_scatter = constraints["fused_scatter"] + else: + fused_scatter = can_use_fused_scatter and split_k == 1 + # Handshake with the HBM swizzling + num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, precision_config) + ret = OptFlags( + block_m=block_m, + block_n=block_n, + block_k=block_k, + num_warps=num_warps, + num_stages=num_stages, + fused_scatter=fused_scatter, + group_m=group_m, + xcd_swizzle=xcd_swizzle, + w_cache_modifier=None, + split_k=split_k, + is_persistent=is_persistent, + epilogue_subtile=epilogue_subtile, + arch=arch, + target_kernel_kwargs=dict(), + idle_sms=constraints.get("idle_sms", 0), + ) + # check constraints + assert all(getattr(ret, ck) == cv for ck, cv in constraints.items() if cv is not None), f"{ret} != {constraints}" + return ret + +# -------------- +# User Interface +# -------------- + +_opt_flags_constraints: dict = dict() +_opt_flags: OptFlags | None = None + +def update_opt_flags_constraints(constraints: dict[str, int]): + global _opt_flags_constraints + _opt_flags_constraints.update(constraints) + +def reset_opt_flags_constraints(): + global _opt_flags_constraints + _opt_flags_constraints = dict() + +def set_opt_flags(opt_flags: OptFlags): + global _opt_flags + assert not _opt_flags_constraints, "setting constraints is incompatible with manual flags override" + assert not _opt_flags, "opt_flags already set; please reset to None first" + _opt_flags = opt_flags + +class InapplicableConstraint(Exception): + pass + +def make_opt_flags( + out_dtype, + lhs_dtype, + rhs_dtype, + precision_config, + m, + n, + k, + routing_data, + can_use_persistent_tma, + can_use_fused_scatter, + epilogue_effective_itemsize, +): + if _opt_flags_constraints.get("is_persistent", False) and not can_use_persistent_tma: + raise InapplicableConstraint("cannot enforce `is_persistent=True` constraint") + if _opt_flags_constraints.get("fused_scatter", False) and not can_use_fused_scatter: + raise InapplicableConstraint("cannot enforce `fused_scatter=True` constraint") + enforce_bitwise_invariance = precision_config.enforce_bitwise_invariance + if _opt_flags is not None: + assert not _opt_flags_constraints + return _opt_flags + args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, m, n, k, + routing_data, can_use_persistent_tma, can_use_fused_scatter, + enforce_bitwise_invariance, epilogue_effective_itemsize, + _opt_flags_constraints] + backend = triton.runtime.driver.active.get_current_target().backend + if backend == "hip": + return make_default_opt_flags_amd(*args) + if backend == "cuda": + return make_default_opt_flags_nvidia(*args) + assert False diff --git a/triton_kernels/matmul_ogs_details/opt_flags_details/__init__.py b/triton_kernels/matmul_ogs_details/opt_flags_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/matmul_ogs_details/opt_flags_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_amd.py b/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_amd.py new file mode 100644 index 0000000000..11d8102d47 --- /dev/null +++ b/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_amd.py @@ -0,0 +1,37 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_amd.py +# Triton is licensed under the MIT License. + +import torch +import triton +from triton_kernels.target_info import get_cdna_version +from triton_kernels.tensor import bitwidth + + +def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, precision_config): + lhs_width = bitwidth(lhs_dtype) / 8 + rhs_width = bitwidth(rhs_dtype) / 8 + + # block_n: + n_cu = torch.cuda.get_device_properties(0).multi_processor_count + if n is not None: + if n <= 128 and (n & (n - 1)) == 0: + block_n = n + else: + block_n = max(32, min(256, triton.next_power_of_2(grid_m * n * num_xcds // n_cu))) + elif block_m > 64: + block_n = 256 + else: + block_n = 128 + + if get_cdna_version() == 4 and block_m == 128: + block_n = 512 + + # block_k needs to match the cacheline size (128B) + block_k = int(128 // min(lhs_width, rhs_width)) + + # TODO: block_k = 128 seems to work better for now. + # perhaps due to increased number of k loops to pipeline + if precision_config.weight_scale is not None and get_cdna_version() != 4: + block_k = 128 + return block_n, block_k diff --git a/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py b/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py new file mode 100644 index 0000000000..9ae84646c9 --- /dev/null +++ b/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py @@ -0,0 +1,115 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py +# Triton is licensed under the MIT License. + +import torch +import triton +from triton_kernels import target_info +from triton_kernels.tensor import get_layout, bitwidth, FP4 +from triton_kernels.tensor_details.layout import HopperMXScaleLayout +from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE + + +def compute_grid_size(routing_data, m, n, block_m, block_n): + if routing_data is not None: + grid_m = routing_data.n_blocks(m, block_m) + else: + grid_m = triton.cdiv(m, block_m) + grid_n = (n + block_n - 1) // block_n + return grid_m * grid_n + + +def compute_block_n(n: int, arch, precision_config): + # block_n: + layout = get_layout(precision_config.weight_scale) + if isinstance(layout, HopperMXScaleLayout) and layout.num_warps == 4: + return 128 + elif precision_config.max_num_imprecise_acc is None and n > 128: + return 256 + else: + return max(16, min(128, triton.next_power_of_2(n))) + + +def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config): + lhs_width = bitwidth(lhs_dtype) + rhs_width = bitwidth(rhs_dtype) + # block_k needs to match the cacheline size (1024 bits) + block_k = int(1024 // min(lhs_width, rhs_width)) + has_native_mxfp = target_info.cuda_capability_geq(10, 0) + if rhs_width == 4 and not has_native_mxfp: + block_k = 128 + elif k is not None: + block_k = max(32, min(triton.next_power_of_2(k), block_k)) + has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None + if has_native_mxfp and is_persistent and has_mx_weight_scale: + block_k = min(block_k, 128) + return block_k + + +def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int: + device_props = torch.cuda.get_device_properties(0) + n_sms = device_props.multi_processor_count + split_k = n_sms // grid_size + if k is not None: + # avoid split_k for small k + num_block_k = triton.cdiv(k, block_k) + split_k = min(split_k, num_block_k // 4) + split_k = max(split_k, 1) + return split_k + + +def compute_num_warps(block_m, block_n, precision_config): + layout = get_layout(precision_config.weight_scale) + if isinstance(layout, HopperMXScaleLayout): + return layout.num_warps + return max(block_m * block_n // 4096, 4) + + +def compute_num_stages( + precision_config, + is_persistent, + block_m, + block_n, + block_k, + out_dtype, + lhs_dtype, + rhs_dtype, + epilogue_subtile, + epilogue_effective_itemsize, +): + if precision_config.max_num_imprecise_acc is not None: + return 3 + weight_size = bitwidth(rhs_dtype) / 8 + stage_size = block_m * block_k * lhs_dtype.itemsize + block_k * block_n * weight_size + device_props = torch.cuda.get_device_properties(0) + smem_capacity = device_props.shared_memory_per_block_optin + has_native_mxfp = target_info.cuda_capability_geq(10, 0) + if has_native_mxfp and getattr(precision_config, "weight_scale", None) is not None: + if rhs_dtype == FP4: + # 4-bit e2m1 weights are padded 2x + # https://docs.nvidia.com/cuda/parallel-thread-execution/#packing-format-used-for-matrix-a-and-b-by-kind-mxf8f6f4-in-shared-memory + stage_size += block_k * block_n * weight_size + + if is_persistent: + # Per-stage wait barrier + stage_size += 8 + if target_info.cuda_capability_geq(10, 0): + acc_size = epilogue_effective_itemsize or out_dtype.itemsize + else: + acc_size = out_dtype.itemsize + if target_info.cuda_capability_geq(10, 0) and epilogue_subtile is not None: + acc_block_n = block_n // epilogue_subtile + else: + acc_block_n = block_n + # pipelined TMA store local to global, or + # pipelined layout conversion before store of the accumulator + # note: layout conversion has some padding + smem_capacity -= int((block_m + 4) * acc_block_n * acc_size) + if precision_config.weight_scale is not None: + # mx scales + stage_size += block_n * (block_k // int(MXFP_BLOCK_SIZE)) + elif has_native_mxfp: + # mx scales + stage_size += block_n * (block_k // int(MXFP_BLOCK_SIZE)) + num_stages = min(4, smem_capacity // int(stage_size)) + return num_stages diff --git a/triton_kernels/numerics.py b/triton_kernels/numerics.py new file mode 100644 index 0000000000..0bbe6b1b63 --- /dev/null +++ b/triton_kernels/numerics.py @@ -0,0 +1,46 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics.py +# Triton is licensed under the MIT License. + +import torch +from dataclasses import dataclass + +MAX_FINITE_FLOAT8E5 = 57344.0 +MAX_FINITE_FLOAT8E4NV = 448.0 +MAX_FINITE_FLOAT8E4B8 = 240.0 + + +@dataclass(frozen=True) +class BaseFlexData: + dtype: torch.dtype | None = None + + def view(self, x: torch.Tensor): + if self.dtype is None: + return x + return x.view(self.dtype) + + def reinterpret(self, x): + if self.dtype is None or x.dtype.itemsize > 1: + return x + return x.view(self.dtype) + + +@dataclass(frozen=True) +class InFlexData(BaseFlexData): + scale: torch.Tensor | None = None + + @property + def is_per_batch(self): + return False if self.scale is None else len(self.scale) > 1 + + +@dataclass(frozen=True) +class OutFlexData(BaseFlexData): + expected_scale: torch.Tensor | None = None + actual_scale: torch.Tensor | None = None + checksum_scale: torch.Tensor | None = None + + def __iter__(self): + yield self.expected_scale + yield self.actual_scale + yield self.checksum_scale diff --git a/triton_kernels/numerics_details/__init__.py b/triton_kernels/numerics_details/__init__.py new file mode 100644 index 0000000000..06cad1246f --- /dev/null +++ b/triton_kernels/numerics_details/__init__.py @@ -0,0 +1,3 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics_details/__init__.py +# Triton is licensed under the MIT License. diff --git a/triton_kernels/numerics_details/flexpoint.py b/triton_kernels/numerics_details/flexpoint.py new file mode 100644 index 0000000000..ad905d009d --- /dev/null +++ b/triton_kernels/numerics_details/flexpoint.py @@ -0,0 +1,194 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics_details/flexpoint.py +# Triton is licensed under the MIT License. + +from ..numerics import MAX_FINITE_FLOAT8E4B8, MAX_FINITE_FLOAT8E4NV, MAX_FINITE_FLOAT8E5 +import triton +import triton.language as tl +from triton_kernels.target_info import cuda_capability_geq + +# ------------------------------- +# Kernels stuff +# ------------------------------- + +TL_MAX_FINITE_FLOAT8E5 = tl.constexpr(MAX_FINITE_FLOAT8E5) +TL_MAX_FINITE_FLOAT8E4NV = tl.constexpr(MAX_FINITE_FLOAT8E4NV) +TL_MAX_FINITE_FLOAT8E4B8 = tl.constexpr(MAX_FINITE_FLOAT8E4B8) +TL_MAX_FINITE_FLOAT8E4B15 = tl.constexpr(1.750) +TL_MAX_FINITE_FLOAT16 = tl.constexpr(65472.0) + +TL_RCP_MAX_FINITE_FLOAT8E5 = tl.constexpr(0x37924925) # 0x1.24924Ap-16 +TL_RCP_MAX_FINITE_FLOAT8E4NV = tl.constexpr(0x3B124925) # 0x1.24924Ap-9 +TL_RCP_MAX_FINITE_FLOAT8E4B8 = tl.constexpr(0x3B888889) # 0x1.111112p-8 +TL_RCP_MAX_FINITE_FLOAT8E4B15 = tl.constexpr(0x3F124925) # 0x1.24924Ap-1 +TL_RCP_MAX_FINITE_FLOAT16 = tl.constexpr(0x37802008) # 0x1.004010p-16 + + +@triton.jit +def max_finite(dtype): + if dtype == tl.constexpr(tl.float8e5): + return TL_MAX_FINITE_FLOAT8E5 + elif dtype == tl.constexpr(tl.float8e4nv): + return TL_MAX_FINITE_FLOAT8E4NV + elif dtype == tl.constexpr(tl.float8e4b8): + return TL_MAX_FINITE_FLOAT8E4B8 + elif dtype == tl.constexpr(tl.float8e4b15): + return TL_MAX_FINITE_FLOAT8E4B15 + elif dtype == tl.constexpr(tl.float16): + return TL_MAX_FINITE_FLOAT16 + else: + tl.static_assert(tl.constexpr(False), f"{dtype} not supported in flexpoint") + + +@triton.jit +def rcp_max_finite(dtype): + if dtype == tl.constexpr(tl.float8e5): + return TL_RCP_MAX_FINITE_FLOAT8E5 + elif dtype == tl.constexpr(tl.float8e4nv): + return TL_RCP_MAX_FINITE_FLOAT8E4NV + elif dtype == tl.constexpr(tl.float8e4b8): + return TL_RCP_MAX_FINITE_FLOAT8E4B8 + elif dtype == tl.constexpr(tl.float8e4b15): + return TL_RCP_MAX_FINITE_FLOAT8E4B15 + elif dtype == tl.constexpr(tl.float16): + return TL_RCP_MAX_FINITE_FLOAT16 + else: + tl.static_assert(tl.constexpr(False), f"{dtype} not supported in flexpoint") + + +@triton.jit +def sm86_min_nan_xorsign_abs_f32(a, b): + """Wrapper for min.NaN.xorsign.abs.f32 PTX instruction. + + Computes the minimum of the absolute values of the two inputs and sets its sign to the XOR of the signs of the inputs. + NaN inputs are propagated to the output. + + Requires CUDA compute capability 8.6+ (A100 and A30 Ampere GPUs don't support it, but A40/A16/A10/A2, Ada, and Hopper GPUs do). + """ + tl.static_assert(cuda_capability_geq(8, 6), "min.NaN.xorsign.abs.f32 requires CUDA compute capability 8.6+") + tl.static_assert(a.dtype == tl.float32, "min.NaN.xorsign.abs.f32 requires float32 inputs") + tl.static_assert(b.dtype == tl.float32, "min.NaN.xorsign.abs.f32 requires float32 inputs") + + return tl.inline_asm_elementwise( + """{ + min.NaN.xorsign.abs.f32 $0, $1, $2; + }""", + "=r,r,r", + [a, b], + dtype=tl.float32, + is_pure=True, + pack=1, + ) + + +@triton.jit +def sm86_max_nan_xorsign_abs_f32(a, b): + """Wrapper for max.NaN.xorsign.abs.f32 PTX instruction. + + Computes the maximum of the absolute values of the two inputs and sets its sign to the XOR of the signs of the inputs. + NaN inputs are propagated to the output. + + Requires CUDA compute capability 8.6+ (A100 and A30 Ampere GPUs don't support it, but A40/A16/A10/A2, Ada, and Hopper GPUs do). + """ + tl.static_assert(cuda_capability_geq(8, 6), "max.NaN.xorsign.abs.f32 requires CUDA compute capability 8.6+") + tl.static_assert(a.dtype == tl.float32, "max.NaN.xorsign.abs.f32 requires float32 inputs") + tl.static_assert(b.dtype == tl.float32, "max.NaN.xorsign.abs.f32 requires float32 inputs") + + return tl.inline_asm_elementwise( + """{ + max.NaN.xorsign.abs.f32 $0, $1, $2; + }""", + "=r,r,r", + [a, b], + dtype=tl.float32, + is_pure=True, + pack=1, + ) + + +@triton.jit +def load_scale(scale_ptr): + return 1.0 if scale_ptr is None else tl.load(scale_ptr) + + +@triton.jit +def flex_to_float(x, scale_ptr): + scale = load_scale(scale_ptr) + return x.to(tl.float32) * scale + + +@triton.jit +def clip(x, limit): + res = tl.minimum(x, limit) + res = tl.maximum(-limit, res) + return res + + +@triton.jit +def nan_propagating_absmax_reduce(x, axis=None): + if cuda_capability_geq(8, 6): + # abs-max-reduce as floating-point if `max.NaN.xorsign.abs.f32` is supported. + x_absmax = tl.reduce(x, axis, sm86_max_nan_xorsign_abs_f32) + # Note: sign of reduction result is the xor of signs of all inputs, explicitly clear the sign bit to fix it. + x_absmax = x_absmax.to(tl.uint32, bitcast=True) & 0x7FFFFFFF + else: + # Clear the sign bit, max-reduce as integer (same as NaN-propagating max-reduce as float) + masked_abs_x = x.to(tl.uint32, bitcast=True) & 0x7FFFFFFF + x_absmax = tl.max(masked_abs_x, axis) + + return x_absmax + + +@triton.jit +def compute_scale(x, Out): + x_absmax = nan_propagating_absmax_reduce(tl.ravel(x, can_reorder=True)) + + # atomic_max does not propagate NaNs, so we replace them with +inf (0x7f800000). + # We use integer minimum because NaNs are above +inf in integer representation. + x_absmax = tl.minimum(x_absmax, 0x7F800000).to(tl.float32, bitcast=True) + RCP_MAX_VALUE = rcp_max_finite(Out.dtype.element_ty) + return tl.fma(x_absmax, RCP_MAX_VALUE.to(tl.float32, bitcast=True), 1.0e-30) + + +@triton.jit +def update_scale(x, scale_ptr, Out) -> None: + if scale_ptr is not None: + scale = compute_scale(x, Out) + tl.atomic_max(scale_ptr, scale, sem="relaxed") + + +@triton.jit +def float_to_flex( + x, + expected_scale_ptr_or_val, + actual_scale_ptr, + checksum_scale_ptr, + mask, + Out, + saturate_infs: tl.constexpr, +): + if expected_scale_ptr_or_val is not None: + if expected_scale_ptr_or_val.dtype.is_ptr(): + invscale = 1.0 / tl.load(expected_scale_ptr_or_val) + else: + invscale = 1.0 / expected_scale_ptr_or_val + else: + invscale = 1.0 + if checksum_scale_ptr is not None: + x_int32 = x.to(tl.int32, bitcast=True) + zero = tl.cast(0.0, tl.int32) + if mask is not None: + x_int32 = tl.where(mask, x_int32, zero) + checksum_local = tl.xor_sum(tl.ravel(x_int32, can_reorder=True), 0) + tl.atomic_add(checksum_scale_ptr, checksum_local) + if mask is not None: + if actual_scale_ptr is not None: + x = tl.where(mask, x, 0.0) + update_scale(x, actual_scale_ptr, Out) + x = x * invscale + # if expected_scale_ptr is not None, we applied flexpoint scale. We only want to clip in this case. + if expected_scale_ptr_or_val is not None: + if saturate_infs: + CLIP_VALUE = max_finite(Out.dtype.element_ty) + x = clip(x, CLIP_VALUE) + return x diff --git a/triton_kernels/numerics_details/mxfp.py b/triton_kernels/numerics_details/mxfp.py new file mode 100644 index 0000000000..bd4fca8fc8 --- /dev/null +++ b/triton_kernels/numerics_details/mxfp.py @@ -0,0 +1,307 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics_details/mxfp.py +# Triton is licensed under the MIT License. + +# isort: off +# fmt: off +from enum import Enum +import triton +import torch +import torch.nn.functional as F +from .mxfp_details._upcast_from_mxfp import _upcast_from_mxfp +from .mxfp_details._downcast_to_mxfp import _downcast_to_mxfp, MXFP_BLOCK_SIZE, _quantize_mxfp8_fn + +# ----------------------------------------------------------------------------- +# Dequantization / Quantization Utilities +# ----------------------------------------------------------------------------- + + +class DequantScaleRoundingMode(Enum): + ROUND_UP = 0 + ROUND_DOWN = 1 + + +def downcast_to_mxfp(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis: int, + DEQUANT_SCALE_ROUNDING_MODE: DequantScaleRoundingMode = DequantScaleRoundingMode.ROUND_UP): + """ + Convert the src weights to mx format. The src weight is quantized along the axis dimension. + + If weight_quant_type is torch.uint8, we output mxfp4 where two e2m1 values are packed into a single byte. + Note that this means the k_dim of the tensor will be half of the logical k_dim. + + If weight_quant_type is torch.float8_e4m3fn or torch.float8_e5m2, we output mxfp8 with the float8s are stored + in their respective formats. + """ + ndim = src_tensor.ndim + assert -ndim <= axis < ndim, f"Invalid axis {axis=}" + axis = axis if axis >= 0 else axis + ndim + # downcast + src_tensor = src_tensor.transpose(axis, src_tensor.ndim - 1) + is_fp4 = out_quant_type == torch.uint8 + is_fp8 = out_quant_type in (torch.float8_e4m3fn, torch.float8_e5m2) + assert is_fp4 or is_fp8 + divisor = 2 if is_fp4 else 1 + L = src_tensor.shape[-1] + if is_fp4: + assert L % 2 == 0, f"axis dim must be divisible by 2 for e2m1. Got {L}" + out_shape = src_tensor.shape[:-1] + (L // divisor, ) + out_scale_shape = src_tensor.shape[:-1] + (triton.cdiv(L, MXFP_BLOCK_SIZE), ) + + out_quant_tensor = src_tensor.new_empty(out_shape, dtype=out_quant_type) + out_scale = src_tensor.new_empty(out_scale_shape, dtype=torch.uint8) + + if src_tensor.numel() > 0: + kernel_src_tensor = src_tensor.reshape(-1, src_tensor.shape[-1]) + kernel_quant_tensor = out_quant_tensor.view(-1, out_quant_tensor.shape[-1]) + kernel_scale = out_scale.view(-1, out_scale.shape[-1]) + + BLOCK_OUT_DIM = 128 + BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value + grid_out = triton.cdiv(kernel_src_tensor.shape[0], BLOCK_OUT_DIM) + grid_quant = triton.cdiv(kernel_src_tensor.shape[1], BLOCK_QUANT_DIM) + + _downcast_to_mxfp[(grid_out, grid_quant)](kernel_quant_tensor, *kernel_quant_tensor.stride(), kernel_scale, + *kernel_scale.stride(), kernel_src_tensor, *kernel_src_tensor.stride(), + *kernel_src_tensor.shape, BLOCK_OUT_DIM, BLOCK_QUANT_DIM, + DEQUANT_SCALE_ROUNDING_MODE.value, num_warps=8) + + out_quant_tensor = out_quant_tensor.transpose(axis, src_tensor.ndim - 1) + out_scale = out_scale.transpose(axis, src_tensor.ndim - 1) + return out_quant_tensor, out_scale + + +def upcast_from_mxfp(tensor: torch.Tensor, scale: torch.Tensor, target_dtype: torch.dtype, axis: int): + """ + Upcasts an mxfp (packed) weight tensor back to float16 or bfloat16. + + The function assumes that the tensors were quantized along the given axis. + It permutes the tensor so that the quantized axis is last, reshapes to 2D, + launches the Triton upcast kernel, and then unpermutes back to the original order. + """ + ndim = tensor.ndim + assert -ndim <= axis < ndim, f"Invalid axis {axis=}" + axis = axis if axis >= 0 else axis + ndim + assert tensor.ndim == scale.ndim, (f"Weight and scale must have the same number of dimensions. " + f"Got {tensor.ndim=} and {scale.ndim=}") + # dtype checks + assert tensor.dtype in {torch.uint8, torch.float8_e5m2, torch.float8_e4m3fn}, \ + f"Invalid tensor dtype {tensor.dtype=}" + assert scale.dtype == torch.uint8, f"Invalid scale dtype {scale.dtype=}" + assert target_dtype in (torch.float16, torch.bfloat16, torch.float32), f"Invalid output dtype {target_dtype=}" + # upcast + logical_quant_dim = tensor.shape[axis] * (2 if tensor.dtype == torch.uint8 else 1) + tensor = tensor.transpose(axis, tensor.ndim - 1).contiguous() + scale = scale.transpose(axis, scale.ndim - 1).contiguous() + out = torch.empty((*tensor.shape[:-1], logical_quant_dim), dtype=target_dtype, device=tensor.device) + reshaped_out = out.view(-1, out.shape[-1]) + reshaped_tensor = tensor.view(-1, tensor.shape[-1]) + reshaped_scale = scale.view(-1, scale.shape[-1]) + BLOCK_OUT_DIM = 128 + BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value + blocks_out_dim = triton.cdiv(reshaped_out.shape[0], BLOCK_OUT_DIM) + blocks_quant_dim = triton.cdiv(reshaped_out.shape[1], BLOCK_QUANT_DIM) + _upcast_from_mxfp[(blocks_out_dim, blocks_quant_dim)](reshaped_out, *reshaped_out.stride(), reshaped_scale, + *reshaped_scale.stride(), reshaped_tensor, + *reshaped_tensor.stride(), *reshaped_out.shape, BLOCK_OUT_DIM, + BLOCK_QUANT_DIM, num_warps=8) + out = out.transpose(axis, scale.ndim - 1).contiguous() + return out + + +# ------------ + + +def right_shift_unsigned(x, shift): + # CUDA torch does not support bit ops on uint32, so we need to mask to get unsigned right shift + return (x >> shift) & ((1 << (32 - shift)) - 1) + + +def get_max_quant_val(dtype: torch.dtype): + d = {torch.uint8: 6.0, torch.float8_e5m2: 57344.0, torch.float8_e4m3fn: 448.0} + assert dtype in d + return d[dtype] + + +def downcast_to_mxfp_torch(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis: int, + DEQUANT_SCALE_ROUNDING_MODE: DequantScaleRoundingMode = DequantScaleRoundingMode.ROUND_UP): + """ + Converts the src tensor to the output format specified by out_quant_type. + axis: The axis along which the tensors are contiguous and quantization is applied. + DEQUANT_SCALE_ROUNDING_MODE: 0 for ROUND_UP, 1 for ROUND_DOWN. + + Returns: + out_quant_tensor: Quantized tensor in mx format. + • For mxfp8, the output has the same shape as src_tensor. + • For mxfp4, the size along the axis is halved, and the tensor is returned as a torch.uint8. + scale: Scale tensor (stored as uint8) computed per group of 32 elements along the axis. + Its shape is the same as src_tensor except that the axis is replaced by ceil(L/32), + where L is the original length along that axis. + """ + # This should probably be packed into its own tiny class + ndim = src_tensor.ndim + assert -ndim <= axis < ndim, f"Invalid axis {axis=}" + assert src_tensor.dtype in {torch.float32, torch.bfloat16, + torch.float16}, f"Invalid input tensor dtype {src_tensor.dtype}" + + axis = axis if axis >= 0 else axis + ndim + is_fp4 = out_quant_type == torch.uint8 + is_fp8 = "float8" in str(out_quant_type) + assert is_fp4 or is_fp8, f"Invalid input tensor dtype {out_quant_type}" + + device = src_tensor.device + + # For mxfp4 conversion, we assume the contiguous axis length is even. + if is_fp4: + axis_shape = src_tensor.size(axis) + assert axis_shape % 2 == 0, "For mxfp4 conversion the contiguous axis length must be even." + + # Permute the tensor so that the contiguous axis becomes the last dimension. + src = src_tensor.transpose(axis, src_tensor.ndim - 1).to(torch.float32) + axis_shape = src.shape[-1] + + # Pad the axis to be divisible by 32, in case it is not. + next_multiple = triton.cdiv(axis_shape, MXFP_BLOCK_SIZE) * MXFP_BLOCK_SIZE + pad_amount = next_multiple - axis_shape + padded_src = F.pad(src, (0, pad_amount)) + valid_mask = F.pad(torch.ones_like(src, dtype=torch.bool), (0, pad_amount)) + padded_axis_shape = padded_src.size(-1) # now divisible by 32 + + # --- Compute per-group maximums for scale --- + # Set padded entries to -1 so they don’t affect the max. + abs_f = torch.abs(padded_src) + abs_f = torch.where(valid_mask, abs_f, torch.tensor(-1.0, device=device, dtype=padded_src.dtype)) + # Reshape the last dimension into groups of 32. + new_shape = padded_src.shape[:-1] + (padded_axis_shape // MXFP_BLOCK_SIZE, MXFP_BLOCK_SIZE) + abs_groups = abs_f.view(*new_shape) + # Compute maximum along the group dimension (of size 32). + max_val, _ = abs_groups.max(dim=-1, keepdim=True) + + # Choose a max quantization value depending on type. + max_quant_val = get_max_quant_val(out_quant_type) + dequant_scale = max_val / max_quant_val # shape: (..., padded_axis_shape//32, 1) + + # Convert to int to round the FP32 scale, prior to quantization! + ds_int = dequant_scale.view(torch.int32) + if DEQUANT_SCALE_ROUNDING_MODE == DequantScaleRoundingMode.ROUND_UP: + ds_int_rounded = (ds_int + 0x007FFFFF) & 0x7F800000 + else: + ds_int_rounded = ds_int & 0x7F800000 + # Reinterpret back as float32. + dequant_scale_rounded = ds_int_rounded.view(torch.float32) + + # Compute the quantization scale. + quant_scale = torch.where(dequant_scale_rounded == 0, torch.tensor(0.0, device=device), 1.0 / dequant_scale_rounded) + + # Quantize the tensor + orig_padded_shape = padded_src.shape + padded_src_groups = padded_src.view(*new_shape) + quant_tensor = padded_src_groups * quant_scale + # Reshape back to the original shape and trim padding + quant_tensor = quant_tensor.view(orig_padded_shape) + quant_tensor = quant_tensor[..., :axis_shape] + + # Finally, convert the quantized tensor to the target format + if is_fp8: + # Conversion must use satfinite PTX, so clamp before the conversion in torch to emulate this behavior + quant_tensor = torch.clamp(quant_tensor, -max_quant_val, max_quant_val) + out_weight = quant_tensor.to(out_quant_type) + else: + assert is_fp4, f"Invalid output quantization type {out_quant_type}" + # For mxfp4, perform bit-level manipulation and pack two 4-bit values per uint8. + # First, reinterpret the quantized tensor bits. + q_int = quant_tensor.contiguous().view(torch.int32) + # Extract sign, exponent, and mantissa. + signs = q_int & 0x80000000 + exponents = right_shift_unsigned(q_int, 23) & 0xFF + mantissas = q_int & 0x7FFFFF + + E8_BIAS = 127 + E2_BIAS = 1 + # Adjust mantissas for subnormals. + mantissas = torch.where(exponents < E8_BIAS, (0x400000 | right_shift_unsigned(mantissas, 1)) >> + (E8_BIAS - exponents - 1), mantissas) + exponents = torch.maximum(exponents, torch.tensor(E8_BIAS - E2_BIAS, device=device)) - (E8_BIAS - E2_BIAS) + e2m1_tmp = right_shift_unsigned(((exponents << 2) | right_shift_unsigned(mantissas, 21)) + 1, 1) + e2m1_tmp = torch.minimum(e2m1_tmp, torch.tensor(0x7, device=device)) + e2m1_value = (right_shift_unsigned(signs, 28) | e2m1_tmp).to(torch.uint8) # shape: (..., even_axis_shape) + + # Pack pairs of 4-bit values along the last dimension. + e2m1_value = e2m1_value.view(*e2m1_value.shape[:-1], axis_shape // 2, 2) + evens = e2m1_value[..., 0] + odds = e2m1_value[..., 1] + out_weight = evens | (odds << 4) # shape: (..., axis_shape//2) + + # --- Process and output the scale --- + dq_scale = (ds_int_rounded.view(*dequant_scale.shape) >> 23).to(torch.uint8) # shape: (..., axis_shape//32, 1) + dq_scale = dq_scale.squeeze(-1) + out_weight = out_weight.transpose(axis, src_tensor.ndim - 1) + dq_scale = dq_scale.transpose(axis, src_tensor.ndim - 1) + return out_weight, dq_scale + + +def cvt_e2m1_to_fp32(input_tensor): + assert input_tensor.dtype == torch.uint8 + + input_tensor = input_tensor.to(torch.int32) + evens = input_tensor & 0xF + odds = (input_tensor >> 4) & 0xF + + vals = [0.0, 0.5, 1, 1.5, 2, 3, 4, 6] + outputs = torch.tensor(vals, dtype=torch.float32, device=input_tensor.device) + outputs = torch.cat([outputs, -outputs]) + + even_floats = outputs[evens] + odd_floats = outputs[odds] + output_tensor = torch.stack([even_floats, odd_floats], dim=-1) + output_tensor = output_tensor.view(*input_tensor.shape[:-1], -1) + return output_tensor + + +def upcast_from_mxfp_torch(tensor: torch.Tensor, scale: torch.Tensor, target_dtype: torch.dtype, axis: int): + """ + Converts the mxfp4/mxfp8 tensor to the target format specified by target_dtype. + axis: The axis along which dequantization is applied. + + Returns: + out_weight: Tensor in the target format. + """ + + ndim = tensor.ndim + assert -ndim <= axis < ndim, f"Invalid axis {axis=}" + is_fp8 = tensor.dtype == torch.float8_e4m3fn or tensor.dtype == torch.float8_e5m2 + assert is_fp8 or tensor.dtype == torch.uint8, f"Invalid input quantization type {tensor.dtype}" + + # Permute the tensor and scale so that the quantization axis becomes the last dimension + axis = axis if axis >= 0 else axis + ndim + scale = scale.transpose(axis, scale.ndim - 1) + tensor = tensor.transpose(axis, tensor.ndim - 1) + + dq_scale = (scale.to(torch.int32) << 23).view(torch.float32) # Shift to the exponent and bitcast to fp32 + if tensor.dtype == torch.uint8: + fp32_tensor = cvt_e2m1_to_fp32(tensor) + else: + fp32_tensor = tensor.to(torch.float32) + + logical_quant_dim = tensor.shape[-1] * (2 if tensor.dtype == torch.uint8 else 1) + axis_shape = fp32_tensor.size(-1) + padded_axis_shape = triton.cdiv(logical_quant_dim, MXFP_BLOCK_SIZE) * MXFP_BLOCK_SIZE + pad_size = padded_axis_shape - axis_shape + padded_tensor = F.pad(fp32_tensor, (0, pad_size)) + + new_axis_shape = padded_tensor.shape[-1] + new_shape = padded_tensor.shape[:-1] + (new_axis_shape // MXFP_BLOCK_SIZE, MXFP_BLOCK_SIZE) + padded_tensor = padded_tensor.view(*new_shape) + dq_scale_padded = dq_scale.unsqueeze(-1) # shape: [..., ceil(axis_shape/32), 1] + out_padded = padded_tensor * dq_scale_padded + + # Flatten back and remove the padded tail + out_padded = out_padded.view(*fp32_tensor.shape[:-1], new_axis_shape) + out_tensor = out_padded[..., :axis_shape] + + out_tensor = out_tensor.to(target_dtype).contiguous() + out_tensor = out_tensor.transpose(axis, tensor.ndim - 1) + + return out_tensor + + +quantize_mxfp8_fn = _quantize_mxfp8_fn diff --git a/triton_kernels/numerics_details/mxfp_details/__init__.py b/triton_kernels/numerics_details/mxfp_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/numerics_details/mxfp_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py b/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py new file mode 100644 index 0000000000..616dfd0dac --- /dev/null +++ b/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py @@ -0,0 +1,162 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + +# fmt: off + + +MXFP_BLOCK_SIZE = tl.constexpr(32) + + +@triton.jit +def _get_max_quant_val(dtype: tl.constexpr): + if dtype == tl.uint8: + return 6.0 + elif dtype == tl.float8e5: + return 57344.0 + elif dtype == tl.float8e4nv: + return 448.0 + else: + tl.static_assert(False, f"Invalid {dtype=}") + +@triton.jit +def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.constexpr, + DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr = 0): + is_fp8: tl.constexpr = mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5 + BLOCK_SIZE_OUT_DIM: tl.constexpr = src_tensor.shape[0] + BLOCK_SIZE_QUANT_DIM: tl.constexpr = src_tensor.shape[1] + BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = src_tensor.shape[1] // MXFP_BLOCK_SIZE + + # Explicit cast to fp32 since most ops are not supported on bfloat16. We avoid needless conversions to and from bf16 + f32_tensor = src_tensor.to(tl.float32) + abs_tensor = tl.abs(f32_tensor) + abs_tensor = tl.where(valid_src_mask, abs_tensor, -1.0) # Don't consider padding tensors in scale computation + abs_tensor = tl.reshape(abs_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE]) + max_val = tl.max(abs_tensor, axis=2, keep_dims=True) + dequant_scale = max_val / _get_max_quant_val(mx_tensor_dtype) + if DEQUANT_SCALE_ROUNDING_MODE == 0: + # DequantScaleRoundingMode.ROUND_UP + # compute 2 ** ceil(log2(dequant_scale)) + # Adding 0x007FFFFF adds exponent by 1 unless mantissa is all zeros + # A corner case: exponent is 0xFF that will overflow but that's already + # NaN so assume we don't care. + dequant_scale_exponent = (dequant_scale.to(tl.uint32, bitcast=True) + 0x007FFFFF) & 0x7F800000 + else: + # DequantScaleRoundingMode.ROUND_DOWN + # compute 2 ** floor(log2(dequant_scale)) + assert DEQUANT_SCALE_ROUNDING_MODE == 1 + dequant_scale_exponent = dequant_scale.to(tl.uint32, bitcast=True) & 0x7F800000 + dequant_scale_rounded = dequant_scale_exponent.to(tl.float32, bitcast=True) + quant_scale = tl.where(dequant_scale_rounded == 0, 0, 1.0 / dequant_scale_rounded) + + f32_tensor = tl.reshape(f32_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE]) + quant_tensor = f32_tensor * quant_scale + + # Reshape the tensors after scaling + quant_tensor = quant_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM]) + # Set the invalid portions of the tensor to 0. This will ensure that any padding tensors are 0 in the mx format. + quant_tensor = tl.where(valid_src_mask, quant_tensor, 0) + dequant_scale_exponent = dequant_scale_exponent.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE]) + + # First, we simply extract the exponent part of the scales and store the result + dequant_scale_exponent = (dequant_scale_exponent >> 23).to(tl.uint8) + # Now we must convert the tensors to the mx format. + if is_fp8: + out_tensor = quant_tensor.to(mx_tensor_dtype) + else: + quant_tensor = quant_tensor.to(tl.uint32, bitcast=True) + signs = quant_tensor & 0x80000000 + exponents = (quant_tensor >> 23) & 0xFF + mantissas = (quant_tensor & 0x7FFFFF) + + # 0.25 <= x < 0.75 maps to 0.5, a denormal number + E8_BIAS = 127 + E2_BIAS = 1 + # Move implicit bit 1 at the beginning to mantissa for denormals + adjusted_exponents = tl.core.sub(E8_BIAS, exponents + 1, sanitize_overflow=False) + mantissas = tl.where(exponents < E8_BIAS, (0x400000 | (mantissas >> 1)) >> adjusted_exponents, mantissas) + + # For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0. + exponents = tl.maximum(exponents, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS) + + # Combine sign, exponent, and mantissa, while saturating + # rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right + e2m1_tmp = tl.minimum((((exponents << 2) | (mantissas >> 21)) + 1) >> 1, 0x7) + e2m1_value = ((signs >> 28) | e2m1_tmp).to(tl.uint8) + + e2m1_value = tl.reshape(e2m1_value, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM // 2, 2]) + evens, odds = tl.split(e2m1_value) + out_tensor = evens | (odds << 4) + + return out_tensor, dequant_scale_exponent + +@triton.jit +def _downcast_to_mxfp(mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.constexpr, + mx_scale_ptr, stride_mx_scale_outer, stride_mx_scale_quant, + src_ptr, stride_src_outer, stride_src_quant, + outer_dim, quant_dim, + BLOCK_SIZE_OUT_DIM: tl.constexpr, BLOCK_SIZE_QUANT_DIM: tl.constexpr, + DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr): + + tl.static_assert(stride_mxt_quant == 1, f"Output stride, {stride_mxt_quant=} must be 1.") + tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, f"{BLOCK_SIZE_QUANT_DIM=} must be a multiple of 32") + + # uint8 signifies two fp4 e2m1 values packed into a single byte + mx_tensor_dtype: tl.constexpr = mx_tensor_ptr.dtype.element_ty + tl.static_assert(mx_tensor_dtype == tl.uint8 or (mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5), + f"Invalid {mx_tensor_dtype=}. Must be uint8 or float8.") + + src_dtype: tl.constexpr = src_ptr.dtype.element_ty + tl.static_assert(mx_scale_ptr.dtype.element_ty == tl.uint8, f"{mx_scale_ptr.dtype.element_ty=} must be uint8") + tl.static_assert((src_dtype == tl.bfloat16) or (src_dtype == tl.float16) or (src_dtype == tl.float32), f"{src_dtype=} must be bfloat16 or float16 or float32") + is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8 + + outer_block = tl.program_id(0).to(tl.int64) + quant_block = tl.program_id(1).to(tl.int64) + + K_DIVISOR: tl.constexpr = 2 if is_fp4 else 1 + BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = BLOCK_SIZE_QUANT_DIM // MXFP_BLOCK_SIZE + BLOCK_SIZE_QUANT_MX_TENSOR: tl.constexpr = BLOCK_SIZE_QUANT_DIM // K_DIVISOR + + start_src_quant = quant_block * BLOCK_SIZE_QUANT_DIM + start_mx_scale_quant = quant_block * BLOCK_SIZE_QUANT_MX_SCALE + start_mx_quant = quant_block * BLOCK_SIZE_QUANT_MX_TENSOR + start_out = outer_block * BLOCK_SIZE_OUT_DIM + + src_ptr += start_src_quant * stride_src_quant + start_out * stride_src_outer + mx_scale_ptr += start_mx_scale_quant * stride_mx_scale_quant + start_out * stride_mx_scale_outer + mx_tensor_ptr += start_mx_quant * stride_mxt_quant + start_out * stride_mxt_outer + + offs_src_quant = tl.arange(0, BLOCK_SIZE_QUANT_DIM)[None, :].to(tl.int64) + offs_mxt_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_TENSOR)[None, :].to(tl.int64) + offs_scale_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_SCALE)[None, :].to(tl.int64) + offs_outer = tl.arange(0, BLOCK_SIZE_OUT_DIM)[:, None].to(tl.int64) + + mask_src_quant = start_src_quant + offs_src_quant < quant_dim + mask_n = start_out + offs_outer < outer_dim + full_mask_src = mask_src_quant & mask_n + + mask_mxt_quant = start_mx_quant + offs_mxt_quant < tl.cdiv(quant_dim, K_DIVISOR) + full_mask_mxt = mask_mxt_quant & mask_n + + scale_mask_k = start_mx_scale_quant + offs_scale_quant < tl.cdiv(quant_dim, MXFP_BLOCK_SIZE) + full_scale_mask = scale_mask_k & mask_n + + src_tensor_offsets = offs_src_quant * stride_src_quant + offs_outer * stride_src_outer + mx_scale_offsets = offs_scale_quant * stride_mx_scale_quant + offs_outer * stride_mx_scale_outer + mx_tensor_offsets = offs_mxt_quant * stride_mxt_quant + offs_outer * stride_mxt_outer + src_tensor = tl.load(src_ptr + src_tensor_offsets, mask=full_mask_src) + + out_tensor, scale_tensor = _compute_quant_and_scale(src_tensor, full_mask_src, mx_tensor_dtype, + DEQUANT_SCALE_ROUNDING_MODE) + + tl.store(mx_scale_ptr + mx_scale_offsets, scale_tensor, mask=full_scale_mask) + tl.store(mx_tensor_ptr + mx_tensor_offsets, out_tensor, mask=full_mask_mxt) + + +@triton.jit(repr=lambda _: "_dequantize_mxfp8") +def _quantize_mxfp8_fn(input, mask, pid=None): + return _compute_quant_and_scale(input, mask, tl.float8e4nv) diff --git a/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py b/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py new file mode 100644 index 0000000000..58e368972b --- /dev/null +++ b/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py @@ -0,0 +1,129 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + +from ._downcast_to_mxfp import MXFP_BLOCK_SIZE + + +# fmt: off +@triton.jit +def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_scale_ptr, stride_scale_outer, + stride_scale_quant, mx_tensor_ptr, stride_tensor_outer, stride_tensor_quant: tl.constexpr, + outer_dim, quant_dim, BLOCK_SIZE_OUT_DIM: tl.constexpr, BLOCK_SIZE_QUANT_DIM: tl.constexpr): + + tl.static_assert(stride_o_quant == 1, "the weight must be contiguous in the k dimension for mx") + tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, "BLOCK_SIZE_K must be a multiple of 32") + # uint8 signifies two fp4 e2m1 values packed into a single byte + mx_tensor_dtype: tl.constexpr = mx_tensor_ptr.dtype.element_ty + dst_dtype: tl.constexpr = out_ptr.dtype.element_ty + tl.static_assert(dst_dtype == tl.float16 or dst_dtype == tl.bfloat16 or dst_dtype == tl.float32) + tl.static_assert( + mx_tensor_dtype == tl.uint8 + or ((mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5) or mx_tensor_dtype == dst_dtype), + "mx_tensor_ptr must be uint8 or float8 or dst_dtype") + tl.static_assert(mx_scale_ptr.dtype.element_ty == tl.uint8, "mx_scale_ptr must be uint8") + + # Determine if we are dealing with fp8 types. + is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8 + is_fp8: tl.constexpr = mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5 + K_DIVISOR: tl.constexpr = 2 if is_fp4 else 1 + BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = BLOCK_SIZE_QUANT_DIM // MXFP_BLOCK_SIZE + BLOCK_SIZE_QUANT_MX_TENSOR: tl.constexpr = BLOCK_SIZE_QUANT_DIM // K_DIVISOR + + # Compute starting indices for the quantized (packed) dimension and the outer dimension. + outer_block = tl.program_id(0).to(tl.int64) + quant_block = tl.program_id(1).to(tl.int64) + + start_mxt_quant = quant_block * BLOCK_SIZE_QUANT_MX_TENSOR + start_out_quant = quant_block * BLOCK_SIZE_QUANT_DIM + start_mx_scale_quant = quant_block * BLOCK_SIZE_QUANT_MX_SCALE + start_out = outer_block * BLOCK_SIZE_OUT_DIM + + mx_tensor_ptr += start_mxt_quant * stride_tensor_quant + start_out * stride_tensor_outer + mx_scale_ptr += start_mx_scale_quant * stride_scale_quant + start_out * stride_scale_outer + out_ptr += start_out * stride_o_outer + start_out_quant * stride_o_quant + + # Compute offsets and masks. + offs_src_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_TENSOR)[None, :].to(tl.int64) + offs_out_quant = tl.arange(0, BLOCK_SIZE_QUANT_DIM)[None, :].to(tl.int64) + offs_outer = tl.arange(0, BLOCK_SIZE_OUT_DIM)[:, None].to(tl.int64) + offs_scale = tl.arange(0, BLOCK_SIZE_QUANT_MX_SCALE)[None, :].to(tl.int64) + + mask_outer = start_out + offs_outer < outer_dim + mask_out_quant = start_out_quant + offs_out_quant < quant_dim + full_mask_out = mask_out_quant & mask_outer + + mask_src_quant = start_mxt_quant + offs_src_quant < tl.cdiv(quant_dim, K_DIVISOR) + full_mask_src = mask_src_quant & mask_outer + + mask_scale = start_mx_scale_quant + offs_scale < tl.cdiv(quant_dim, MXFP_BLOCK_SIZE) + full_scale_mask = mask_scale & mask_outer + + tensor_offsets = offs_src_quant * stride_tensor_quant + offs_outer * stride_tensor_outer + scale_offsets = offs_scale * stride_scale_quant + offs_outer * stride_scale_outer + out_offsets = offs_out_quant * stride_o_quant + offs_outer * stride_o_outer + + # Load the packed tensor and scale. + tensor = tl.load(mx_tensor_ptr + tensor_offsets, mask=full_mask_src) + scale = tl.load(mx_scale_ptr + scale_offsets, mask=full_scale_mask) + + # Upcast the scale to the destination type. + if dst_dtype == tl.bfloat16: + dst_scale = (scale.to(tl.uint16) << 7).to(dst_dtype, bitcast=True) + else: + dst_scale = (scale.to(tl.uint32) << 23).to(tl.float32, bitcast=True) + if dst_dtype == tl.float16: + dst_scale = dst_scale.to(tl.float16) + + # Now upcast the tensor. + intermediate_dtype: tl.constexpr = tl.bfloat16 if dst_dtype == tl.float32 else dst_dtype + if is_fp8: + dst_tensor = tensor.to(intermediate_dtype) + if tensor.dtype == tl.float8e5: + from_e_bits: tl.constexpr = 5 + from_m_bits: tl.constexpr = 2 + to_e_bits: tl.constexpr = 8 if intermediate_dtype == tl.bfloat16 else 5 + to_m_bits: tl.constexpr = 7 if intermediate_dtype == tl.bfloat16 else 10 + + # Preserve infs and nans. FIXME Fp8E5M2_to_Bf16 doesn't preserve them! + non_finite_mask_src: tl.constexpr = ((1 << from_e_bits) - 1) << from_m_bits + non_finite_mask_dst: tl.constexpr = ((1 << to_e_bits) - 1) << to_m_bits + dst_tensor = tl.where( + (tensor.to(tl.uint8, bitcast=True) & non_finite_mask_src) == non_finite_mask_src, + (dst_tensor.to(tl.uint16, bitcast=True) | non_finite_mask_dst).to(intermediate_dtype, bitcast=True), + dst_tensor, + ) + else: + assert is_fp4 + dst_bias: tl.constexpr = 127 if intermediate_dtype == tl.bfloat16 else 15 + dst_0p5: tl.constexpr = 16128 if intermediate_dtype == tl.bfloat16 else 0x3800 + dst_m_bits: tl.constexpr = 7 if intermediate_dtype == tl.bfloat16 else 10 + # e2m1 + em0 = tensor & 0x07 + em1 = tensor & 0x70 + x0 = (em0.to(tl.uint16) << (dst_m_bits - 1)) | ((tensor & 0x08).to(tl.uint16) << 12) + x1 = (em1.to(tl.uint16) << (dst_m_bits - 5)) | ((tensor & 0x80).to(tl.uint16) << 8) + # Three cases: + # 1) x is normal and non-zero: Correct bias + x0 = tl.where((em0 & 0x06) != 0, x0 + ((dst_bias - 1) << dst_m_bits), x0) + x1 = tl.where((em1 & 0x60) != 0, x1 + ((dst_bias - 1) << dst_m_bits), x1) + # 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type + x0 = tl.where(em0 == 0x01, dst_0p5 | (x0 & 0x8000), x0) + x1 = tl.where(em1 == 0x10, dst_0p5 | (x1 & 0x8000), x1) + # 3) x is zero, do nothing + dst_tensor = tl.interleave(x0, x1).to(intermediate_dtype, bitcast=True) + dst_tensor = dst_tensor.to(dst_dtype) + + # Reshape for proper broadcasting: the scale was stored with a 32‐sized “inner” grouping. + dst_tensor = dst_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE]) + dst_scale = dst_scale.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1]) + scale = scale.reshape(dst_scale.shape) + + out_tensor = dst_tensor * dst_scale + # Correct any NaNs encoded via the scale. + out_tensor = tl.where(scale == 0xFF, float("nan"), out_tensor) + out_tensor = out_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM]) + tl.store(out_ptr + out_offsets, out_tensor, mask=full_mask_out) diff --git a/triton_kernels/proton_opts.py b/triton_kernels/proton_opts.py new file mode 100644 index 0000000000..befa2f6ce2 --- /dev/null +++ b/triton_kernels/proton_opts.py @@ -0,0 +1,21 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/proton_opts.py +# Triton is licensed under the MIT License. + +# proton options + +import os + +_launch_metadata_allow_sync = None + + +def launch_metadata_allow_sync(): + global _launch_metadata_allow_sync + if _launch_metadata_allow_sync is None: + _launch_metadata_allow_sync = not (os.getenv("PROTON_LAUNCH_METADATA_NOSYNC") == "1") + return _launch_metadata_allow_sync + + +def set_launch_metadata_allow_sync(allow_sync: bool): + global _launch_metadata_allow_sync + _launch_metadata_allow_sync = allow_sync diff --git a/triton_kernels/reduction_details/__init__.py b/triton_kernels/reduction_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/reduction_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/reduction_details/reduce_bitmatrix.py b/triton_kernels/reduction_details/reduce_bitmatrix.py new file mode 100644 index 0000000000..4f84e73ac0 --- /dev/null +++ b/triton_kernels/reduction_details/reduce_bitmatrix.py @@ -0,0 +1,115 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/reduction_details/reduce_bitmatrix.py +# Triton is licensed under the MIT License. + +import torch +import triton +import triton.language as tl + + +@triton.jit +def vpopc(x): + """ + Vertical popcount + Input x : uint32[..., N] + Output y : uint32[..., 32] + semantics : y[..., i] = sum_j((x[..., j] >> i) & 1) + credits: @apgoucher + """ + + tl.static_assert(x.dtype == tl.uint32, "x should consist of 32-bit unsigned integers") + + BLOCK_N: tl.constexpr = x.shape[-1] # summation axis + BATCHES: tl.constexpr = x.numel // BLOCK_N # number of batches + if BLOCK_N >= 8: + sa1: tl.constexpr = 8 + else: + sa1: tl.constexpr = BLOCK_N + # create 8-way sums in 4-bit fields: + y = tl.reshape(x, [BATCHES, BLOCK_N // sa1, sa1, 1]) + y = (y >> tl.arange(0, 4)[None, None, None, :]) & 0x11111111 + y = tl.sum(y, 2) # [BATCHES, BLOCK_N // sa1, 4] + if BLOCK_N >= 128: + sa2: tl.constexpr = 16 + else: + sa2: tl.constexpr = BLOCK_N // sa1 + # create 128-way sums in 8-bit fields: + y = tl.reshape(y, [BATCHES, BLOCK_N // (sa1 * sa2), sa2, 1, 4]) + y = (y >> (4 * tl.arange(0, 2))[None, None, None, :, None]) & 0x0f0f0f0f + y = tl.sum(y, 2) # [BATCHES, BLOCK_N // (sa1 * sa2), 2, 4] + sa3: tl.constexpr = BLOCK_N // (sa1 * sa2) + # create N-way sums in 32-bit fields: + y = tl.reshape(y, [BATCHES, 1, sa3, 8]) + y = (y >> (8 * tl.arange(0, 4))[None, :, None, None]) & 0x000000ff + y = tl.sum(y, 2) # [BATCHES, 4, 8] + y = tl.reshape(y, x.shape[:-1] + [32]) + return y + + +@triton.jit +def _sum_bitmatrix_memset(Ret, BLOCK: tl.constexpr): + pid = tl.program_id(0) + offs = pid * BLOCK + tl.arange(0, BLOCK) + tl.store(Ret + offs, 0) + + +@triton.jit +def _sum_bitmatrix_rows(B, shape_bm, stride_bm: tl.constexpr, stride_bn: tl.constexpr, # input bitmatrix + Ret, Partials, stride_pm: tl.constexpr, stride_pn, shape_pn, # outputs + BLOCK_MM: tl.constexpr, BLOCK_M: tl.constexpr): + + tl.static_assert(BLOCK_MM % BLOCK_M == 0) + TILE_SIZE: tl.constexpr = BLOCK_MM // BLOCK_M + if isinstance(shape_bm, tl.tensor) and shape_bm.dtype.is_ptr(): + shape_bm = tl.load(shape_bm) + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + offs_m = pid_m * BLOCK_MM + tl.arange(0, BLOCK_MM) + offs_n = pid_n * 32 + tl.arange(0, 32) + n_rows = shape_bm + bits = tl.load(B + pid_n * stride_bn + offs_m * stride_bm, mask=offs_m < n_rows, other=0) + bits = tl.reshape(bits, [TILE_SIZE, BLOCK_M]) + ret = vpopc(bits) # [TILE_SIZE, 32] + + offs_t = pid_m * TILE_SIZE + tl.arange(0, TILE_SIZE) + + tl.atomic_add(Ret + offs_n, tl.sum(ret, 0), sem="relaxed") + tl.store(Partials + offs_t[:, None] * stride_pm + offs_n[None, :] * stride_pn, ret) + + +def clear_sums(n_cols, device, MEMSET_BLOCK=512): + cdiv = triton.cdiv + blocks = cdiv(n_cols, MEMSET_BLOCK) + out_ret = torch.empty((blocks * MEMSET_BLOCK, ), device=device, dtype=torch.int32) + _sum_bitmatrix_memset[(blocks, )](out_ret, MEMSET_BLOCK) + return out_ret + + +def sum_bitmatrix_rows(x, out_ret, partials_block_size=None): + assert partials_block_size is not None + cdiv = triton.cdiv + PARTIALS_BLOCK_M = partials_block_size + n_rows, n_cols = x.shape + n_rows_max = x.shape_max[0] + assert out_ret.shape == (n_cols, ) + + TILE_SIZE = max(1, 128 // PARTIALS_BLOCK_M) + BLOCK_MM = PARTIALS_BLOCK_M * TILE_SIZE + + pids_x = cdiv(n_rows_max, BLOCK_MM) + pids_y = cdiv(n_cols, 32) + out_partials = torch.empty((pids_y * 32, pids_x * TILE_SIZE), device=out_ret.device, dtype=torch.int32) + out_partials = torch.transpose(out_partials, 0, 1) + + # output tensors + _sum_bitmatrix_rows[(pids_x, pids_y)]( + x.storage.data, n_rows, x.stride(0), x.stride(1), # input + out_ret, # output [final reduction] + out_partials, out_partials.stride(0), out_partials.stride(1), + out_partials.shape[1], # output [partial reductions] + BLOCK_M=PARTIALS_BLOCK_M, BLOCK_MM=BLOCK_MM, # constants + num_warps=8) + + out_partials = out_partials[:cdiv(n_rows_max, PARTIALS_BLOCK_M), :] + + return out_ret, out_partials diff --git a/triton_kernels/routing.py b/triton_kernels/routing.py new file mode 100644 index 0000000000..f59a6a001a --- /dev/null +++ b/triton_kernels/routing.py @@ -0,0 +1,396 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/routing.py +# Triton is licensed under the MIT License. + +import torch +import triton +from dataclasses import dataclass, field +from .routing_details._routing_compute import _combined_routing_compute +from .routing_details._routing_compute import _combined_routing_memset +from .routing_details._routing_compute import _routing_clear_bitmatrix +from .routing_details._expt_data import _expt_data_memset +from .routing_details._expt_data import _expt_data_compute +from .target_info import is_hip + + +@dataclass +class GatherIndx: + """ + Indices for an operation that performs: + Y = X[src_idx, :] + """ + # array such that `dst_idx[src_idx] = arange(0, N)` + src_indx: torch.Tensor + dst_indx: torch.Tensor + + +@dataclass +class ScatterIndx: + """ + Indices for an operation that performs: + Y[dst_idx, :] = X + """ + # array such that `dst_idx[src_idx] = arange(0, N)` + src_indx: torch.Tensor + dst_indx: torch.Tensor + + +@dataclass +class ExptData: + # hist[i] is the number of tokens routed to expert i + hist: torch.Tensor + # token_offs_raw[i] is the offset of the first token routed + # to expert i in an expert-sorted array + token_offs_raw: torch.Tensor + # token_offs_pad[block][i] is the offset of the first token routed + # to expert i in an expert-sorted array, assuming histogram + # rounded to the next multiple of `block` + token_offs_pad: dict[int, torch.Tensor] + # block_id_map[block] contain one value for each `pid`` launched by + # the matrix multiplication kernel launched with BLOCK_M=block: + # - the value is -1 if the `pid` has no work to do + # - otherwise, the value is two int16 (packed as an int32) that + # correspond respectively to (1) the expert assigned to + # the tokens processed by this pid; (2) the block assigned to the + # tokens processed by this pid (think `pid_m` in a regular matmul) + # see `test_routing.py` for a reference implementation and more details + block_pid_map: dict[int, torch.Tensor] + + def __post_init__(self): + if self.hist is not None: + assert self.hist.dtype == torch.int32 + if self.token_offs_raw is not None: + assert self.token_offs_raw.dtype == torch.int32 + if self.token_offs_pad is not None: + for v in self.token_offs_pad.values(): + assert v.dtype == torch.int32 + if self.block_pid_map is not None: + for v in self.block_pid_map.values(): + assert v.dtype == torch.int32 + + +@dataclass +class RoutingData: + gate_scal: torch.Tensor = field() + expt_hist: torch.Tensor = field() + n_expts_tot: int = field() + n_expts_act: int = field() + expt_data: ExptData = None + + # Used to make perf annotation cleaner: when we use expert sharding, we can + # use this to tell the "expected" number of local tokens per expert, because + # the actual number can vary per each input. + expected_tokens_per_expt: int = field(default=None) + + def n_blocks(self, n_rows, block_m): + if n_rows <= self.n_expts_tot: + return n_rows + else: + return triton.cdiv(max(n_rows - self.n_expts_tot + 1, 0), block_m) + self.n_expts_tot - 1 + + +# -------------------------- +# sort tokens by expert +# -------------------------- + + +class SortTokens(torch.autograd.Function): + + @staticmethod + def forward(ctx, expt_scal, expt_indx, n_expts_tot, bitmatrix): + HIST_BLOCK_M = 32 + INDX_OFFS_BLOCK_M = 512 + MEMSET_BLOCK = 1024 + cdiv = triton.cdiv + + device = expt_scal.device + dtype = expt_scal.dtype + n_tokens_raw, _ = bitmatrix.shape + n_tokens_pad, n_expts_act = expt_scal.shape + n_gates_pad = n_tokens_pad * n_expts_act + + hist, partial_hist = bitmatrix.sum(partials_block_size=HIST_BLOCK_M) + hist = hist[:n_expts_tot] + assert hist.dtype == torch.int32 + # scratchpad + expt_offs = torch.empty(n_expts_tot, dtype=torch.int32, device=device) + combined_indx = torch.empty(n_gates_pad * 2, dtype=torch.int32, device=device) + # output + topk_indx = combined_indx[:n_gates_pad] + gate_indx = combined_indx[n_gates_pad:] + gate_scal = torch.empty(n_gates_pad, dtype=dtype, device=device) + + token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1a, blocks2a, MEMSET_BLOCK_A, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal( + hist, n_expts_tot, n_gates_pad) + + blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1 + blocks2b = cdiv(n_tokens_pad, HIST_BLOCK_M) + + _combined_routing_memset[(blocks1a + blocks1b, )]( + combined_indx, n_gates_pad * 2, -1, MEMSET_BLOCK, hist, # + expt_offs, hist.shape[0], n_expts_tot, partial_hist, # inputs + partial_hist.shape[0], partial_hist.stride(0), partial_hist.stride(1), # outputs + token_offs_combined, token_offs_combined.stride(0), # + blocks1a, block_pid_map, # + block_m_log2_start, SIZES=block_m_num, BLOCK_A=MEMSET_BLOCK_A, # optimization parameters + BLOCK_N=512, BLOCK_M=INDX_OFFS_BLOCK_M, # tunable parameters + ) + + indx_offs = partial_hist + + _combined_routing_compute[(blocks2a + blocks2b, )]( + topk_indx, gate_indx, gate_scal, # outputs + expt_scal, expt_indx, indx_offs, indx_offs.stride(0), indx_offs.stride(1), # inputs + expt_offs, n_tokens_raw, # input shape + HIST_BLOCK_M, n_expts_act, # constants + hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0), # outputs + block_m_log2_start, block_m_num, HIST2_BLOCK_M, blocks2a, # etc. + ) + + ctx.n_tokens_raw = n_tokens_raw + ctx.n_tokens_pad = n_tokens_pad + ctx.n_expts_act = n_expts_act + ctx.save_for_backward(gate_indx) + return hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map + + @staticmethod + def backward(ctx, _0, _1, _2, dgate_scal, _3, _4, _5): + (gate_indx, ) = ctx.saved_tensors + dgate_scal = dgate_scal[gate_indx] + dgate_scal = dgate_scal.reshape(ctx.n_tokens_pad, ctx.n_expts_act) + return dgate_scal, None, None, None + + +def sort_tokens(expt_scal, expt_indx, n_expts_tot, bitmatrix): + return SortTokens.apply(expt_scal, expt_indx, n_expts_tot, bitmatrix) + + +# -------------------------- +# prune routing +# -------------------------- + + +class PruneRouting(torch.autograd.Function): + + @staticmethod + def forward(ctx, expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): + from .compaction import compaction + n_tokens_pad = expt_scal.shape[0] + assert n_expts_tot % simulated_ep == 0 + _routing_clear_bitmatrix[(n_tokens_pad, )]( + bitmatrix.storage.data, + bitmatrix.storage.data.stride(0), + bitmatrix.storage.data.stride(1), + bitmatrix.storage.data.shape[1], + n_expts_tot // simulated_ep, + BLOCK_N=512, + ) + # perform compaction to update expt_scal / expt_indx + expt_scal, expt_indx = compaction(expt_scal, expt_indx, bitmatrix) + n_expts_tot = n_expts_tot // simulated_ep + bitmatrix.shape[-1] = n_expts_tot + return expt_scal, expt_indx, bitmatrix + + +def prune_routing(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep): + return PruneRouting.apply(expt_scal, expt_indx, bitmatrix, n_expts_tot, simulated_ep) + + +# -------------------------- +# expt_data +# -------------------------- + + +def log2_power_of_two(x): + assert x > 0 and (x & (x - 1)) == 0, "x must be a power of two" + return x.bit_length() - 1 + + +block_m_log2_start = 4 + + +def _compute_expt_data_internal(expt_hist, n_expts_tot, n_gates): + + MEMSET_BLOCK = 512 + HIST2_BLOCK_M = 512 + device = expt_hist.device + n_expts_tot = n_expts_tot + cdiv = triton.cdiv + # block_ms are all powers-of-two between 16 and 128 (inclusive) + block_m_log2_end = 9 if is_hip() else 8 + block_m_num = block_m_log2_end - block_m_log2_start + if n_gates <= n_expts_tot: + max_n_tiles = n_gates + else: + max_n_tiles = n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // 2**block_m_log2_start) + # allocate memory + pad = lambda x: cdiv(x, MEMSET_BLOCK) * MEMSET_BLOCK + dtype = torch.int32 + + token_offs_combined = torch.empty((block_m_num + 1, pad(n_expts_tot + 1)), dtype=dtype, device=device) + + token_offs_raw = token_offs_combined[0][:n_expts_tot + 1] + token_offs_pad = token_offs_combined[1:] + + block_pid_map = torch.empty((block_m_num, pad(max_n_tiles)), dtype=dtype, device=device) + memset_grid = torch.numel(block_pid_map) // MEMSET_BLOCK # exact division + # compute outputs + token_offs_pad = token_offs_pad[:, :n_expts_tot + 1] + block_pid_map = block_pid_map[:, :max_n_tiles] + + blocks1 = memset_grid + block_m_num + 1 + blocks2 = n_expts_tot * block_m_num + return token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num + + +def _unpack_into_dict(x): + + block_m_log2_end = block_m_log2_start + x.shape[0] + x = {2**j: x[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))} + return x + + +def compute_expt_data(expt_hist, n_expts_tot, n_gates): + + if expt_hist is None: + return ExptData(None, None, None, None) + + # this just computes the kernel arguments: + token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal( + expt_hist, n_expts_tot, n_gates) + + _expt_data_memset[(blocks1, )]( + expt_hist, n_expts_tot, # + token_offs_combined, token_offs_combined.stride(0), # + block_pid_map, # + block_m_log2_start, SIZES=block_m_num, BLOCK=MEMSET_BLOCK, # optimization parameters + num_warps=4) + _expt_data_compute[(blocks2, )]( + expt_hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0), # outputs + block_m_log2_start, SIZES=block_m_num, BLOCK=HIST2_BLOCK_M, # optimization parameters + num_warps=4) + + token_offs_pad = _unpack_into_dict(token_offs_pad) + block_pid_map = _unpack_into_dict(block_pid_map) + return ExptData(expt_hist, token_offs_raw, token_offs_pad, block_pid_map) + + +# -------------------------- +# routing +# -------------------------- + + +def routing_from_bitmatrix(bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act): + hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map = sort_tokens( + expt_scal, expt_indx, n_expts_tot, bitmatrix) + token_offs_pad = _unpack_into_dict(token_offs_pad) + block_pid_map = _unpack_into_dict(block_pid_map) + expt_data = ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map) + + # pack the matmul data structure + gather_indx = GatherIndx(src_indx=topk_indx, dst_indx=gate_indx) + scatter_indx = ScatterIndx(src_indx=gate_indx, dst_indx=topk_indx) + return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act, expt_data), gather_indx, scatter_indx + + +def routing(logits, n_expts_act, sm_first=False, expt_indx=None, simulated_ep=1, n_rows=None): + from .topk import topk + if sm_first: + logits = torch.softmax(logits, dim=-1) + expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, # + apply_softmax=not sm_first, y_indx=expt_indx, n_rows=n_rows) + n_expts_tot = logits.shape[-1] // simulated_ep + # mutate bitmatrix + if simulated_ep > 1: + expt_scal, expt_indx, bitmatrix = prune_routing(expt_scal, expt_indx, bitmatrix, logits.shape[-1], simulated_ep) + + return routing_from_bitmatrix(bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act) + + +# -------------------------- +# torch reference +# -------------------------- + + +def compute_expt_data_torch(hist, n_expts_tot, n_gates): + # offset for each experts + device = hist.device + token_offs_raw = torch.cumsum(hist, dim=0) + token_offs_raw = torch.cat((torch.zeros(1, device=device), token_offs_raw)) + token_offs_raw = token_offs_raw.int() + # maximum number of tiles for all values of `block_m` considered + block_ms = [16, 32, 64, 128] + if is_hip(): + block_ms.append(256) + if n_gates <= n_expts_tot: + max_n_tiles = n_gates + else: + # ceil_div(n_gates - n_experts + 1, d_tile) + n_experts - 1 + # ceil_div(x, y): -(-x // y) + max_n_tiles = n_expts_tot - 1 - ((n_expts_tot - n_gates - 1) // min(block_ms)) + # fill up tile offset/infos for each block + token_offs_pad = dict() + block_pid_map = dict() + for block_m in block_ms: + n_tiles = (hist + block_m - 1) // block_m # matmul blocks needed + token_offs_pad[block_m] = torch.cumsum(n_tiles, dim=0) + token_offs_pad[block_m] = torch.cat((torch.zeros(1, device=device), token_offs_pad[block_m])) + token_offs_pad[block_m] = token_offs_pad[block_m].int() + # compute data required to drive ragged batch matmul + block_pid_map[block_m] = -torch.ones(max_n_tiles, dtype=torch.int32, device=device) + + # for e in range(n_expts_tot): + # offset = token_offs_pad[block_m][e] + # for b in range(n_tiles[e]): + # block_pid_map[block_m][offset + b] = (b << 16) + e + + col = torch.arange(max_n_tiles, device=device) + map_vals = torch.arange(n_expts_tot, device=device)[:, None] + (col << 16)[None, :] + map_idxs = token_offs_pad[block_m][:-1, None] + col[None, :] + mask = col[None, :] < n_tiles[:, None] + block_pid_map[block_m].index_put_((map_idxs[mask], ), map_vals.int()[mask]) + return ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map) + + +def topk_torch(vals, k, expt_indx, has_user_provided_indx=False): + # topk of experts + if has_user_provided_indx: + tk_indx = expt_indx + else: + tk_indx = torch.argsort(-vals, dim=1, stable=True)[:, :k] + tk_indx = tk_indx.long() + tk_val = torch.take_along_dim(vals, tk_indx, dim=1) + tk_indx = tk_indx.int() + return tk_val, tk_indx + + +def routing_torch(logits, n_expts_act, sm_first=False, expt_indx=None, n_rows=None): + has_user_provided_indx = expt_indx is not None + n_gates_pad = logits.shape[0] * n_expts_act + + if n_rows is not None: + logits = logits[:n_rows, :] + _, n_expts_tot = logits.shape + if sm_first: + logits = torch.softmax(logits, dim=-1) + expt_scal, expt_indx = topk_torch(logits, n_expts_act, expt_indx, has_user_provided_indx=has_user_provided_indx) + if not sm_first: + expt_scal = torch.softmax(expt_scal, dim=-1) + # sort each token's selections by expert + if not has_user_provided_indx: + expt_indx, sort_indices = torch.sort(expt_indx, dim=1) + expt_scal = torch.gather(expt_scal, 1, sort_indices) + # flatten topk data + expt_scal = expt_scal.reshape(-1) + expt_indx = expt_indx.reshape(-1).to(torch.int32) + # sort by expert_id so experts are contiguous for the matmul + topk_indx = torch.argsort(expt_indx, stable=True) + gate_indx = torch.argsort(topk_indx, stable=True) + gate_scal = expt_scal[topk_indx] + hist = torch.histc(expt_indx, bins=n_expts_tot, max=n_expts_tot - 1).int() # histogram of tokens over experts + # pack the matmul data structure + gather_indx = GatherIndx(src_indx=topk_indx.int(), dst_indx=gate_indx.int()) + scatter_indx = ScatterIndx(src_indx=gate_indx.int(), dst_indx=topk_indx.int()) + # compute expt_data + expt_data = compute_expt_data_torch(hist, n_expts_tot, n_gates_pad) + return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act, expt_data), gather_indx, scatter_indx diff --git a/triton_kernels/routing_details/__init__.py b/triton_kernels/routing_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/routing_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/routing_details/_expt_data.py b/triton_kernels/routing_details/_expt_data.py new file mode 100644 index 0000000000..d5b736a374 --- /dev/null +++ b/triton_kernels/routing_details/_expt_data.py @@ -0,0 +1,68 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/routing_details/_expt_data.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + + +@triton.jit +def _cdiv_pow2(n, log2_k): + return (n + ((1 << log2_k) - 1)) >> log2_k + + +@triton.jit +def _expt_data_memset(Hist, n_expts_tot, MDStarts, tile_starts_stridem, MDTileInfo, first_tile_dim_log2, + SIZES: tl.constexpr, BLOCK: tl.constexpr): + + pid = tl.program_id(0) + + if pid <= SIZES: + + MDStarts += pid * tile_starts_stridem + x_tile = tl.zeros([BLOCK], dtype=MDStarts.dtype.element_ty) + Tile_ptrs = MDStarts + tl.arange(0, BLOCK) + tile_dim_log2 = tl.where(pid == 0, 0, pid + first_tile_dim_log2 - 1) + + for i in range(0, n_expts_tot + 1, BLOCK): + + offs_n = tl.arange(0, BLOCK) + i + mask_n0 = offs_n < n_expts_tot + hist_tok = tl.load(Hist + offs_n, mask=mask_n0, other=0) + hist_tile = _cdiv_pow2(hist_tok, tile_dim_log2) + + tile_starts = tl.cumsum(hist_tile, 0) + x_tile + x_tile += tl.sum(hist_tile, 0).to(MDStarts.dtype.element_ty) + tl.store(Tile_ptrs, tile_starts - hist_tile) + Tile_ptrs += BLOCK + + else: + + pid -= (SIZES + 1) + TileInfoOut = MDTileInfo + pid * BLOCK + tl.arange(0, BLOCK) + tl.store(TileInfoOut, 0xffffffff) + + +@triton.jit +def _expt_data_compute(Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem, first_tile_dim_log2, + SIZES: tl.constexpr, BLOCK: tl.constexpr): + + pid = tl.program_id(0) + + expt_id = pid // SIZES + buff_id = pid % SIZES + + MDTileStarts += buff_id * tile_starts_stridem + MDTileInfo += buff_id * tile_info_stridem + + n_tokens = tl.load(Hist + expt_id) + tile_dim_log2 = first_tile_dim_log2 + buff_id + n_blocks = _cdiv_pow2(n_tokens, tile_dim_log2) + + tile_off = tl.load(MDTileStarts + expt_id) + MDTileInfo += tile_off + + for block_off in range(0, n_blocks, BLOCK): + block_offs = block_off + tl.arange(0, BLOCK) + data = (block_offs << 16) + expt_id + tl.store(MDTileInfo + block_offs, data, mask=block_offs < n_blocks) diff --git a/triton_kernels/routing_details/_routing_compute.py b/triton_kernels/routing_details/_routing_compute.py new file mode 100644 index 0000000000..595d842878 --- /dev/null +++ b/triton_kernels/routing_details/_routing_compute.py @@ -0,0 +1,152 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/routing_details/_routing_compute.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + +from ._expt_data import _expt_data_compute, _expt_data_memset + + +@triton.jit +def _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size, # histogram + BLOCK_N: tl.constexpr): + loop_iterations = (hist_size + BLOCK_N - 1) // BLOCK_N + x = tl.zeros([BLOCK_N], ExpertHist.dtype.element_ty) + for i in range(loop_iterations): + offs_n = i * BLOCK_N + tl.arange(0, BLOCK_N) + mask_n = offs_n < hist_size + hist2 = tl.load(ExpertHist + offs_n, mask=mask_n) + tok_starts = tl.cumsum(hist2, 0) - hist2 + x + x += tl.sum(hist2, 0) + tl.store(FinalExpertOffs + offs_n, tok_starts, mask=mask_n) + offs_n += BLOCK_N + + +@triton.jit +def _routing_compute_indx_offs(PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M: tl.constexpr, expt_id): + offs_m = tl.arange(0, BLOCK_M) + # iterate over input data + curr_sum = 0 + for _ in range(0, shape_pm, BLOCK_M): + offs = offs_m * stride_pm + expt_id * stride_pn + curr = tl.load(PartialHist + offs, mask=offs_m < shape_pm) + out = tl.cumsum(curr, 0) + curr_sum + curr_sum += tl.sum(curr, 0) + tl.store(PartialHist + offs, out - curr, mask=offs_m < shape_pm) + offs_m += BLOCK_M + + +@triton.jit +def _keyed_add(x, y): + + # we keep the key in the upper 16 bits of a uint32: + key_mask: tl.constexpr = 0xffff0000 + + kx = x & key_mask + ky = y & key_mask + z = tl.where(kx == ky, x + y - kx, y) + return z + + +@triton.jit +def _routing_compute_indx(pid_m, GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, + stride_pn, TokensStart, n_tokens, BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr): + + if isinstance(n_tokens, tl.tensor) and n_tokens.dtype.is_ptr(): + n_tokens = tl.load(n_tokens) + n_gates = n_tokens * N_EXPTS_ACT + + tl.static_assert(N_EXPTS_ACT * BLOCK_M <= 32768) + + local_offs = tl.arange(0, N_EXPTS_ACT * BLOCK_M) + offs = pid_m * BLOCK_M * N_EXPTS_ACT + local_offs + expert = tl.load(ExptIndx + offs, mask=(offs < n_gates), other=-1).to(tl.uint32) + + # stable-sort by expert ID: + kv_pairs = ((expert << 16) | local_offs).to(tl.uint32) + kv_pairs = tl.sort(kv_pairs, 0) + expert = kv_pairs >> 16 + offs = pid_m * BLOCK_M * N_EXPTS_ACT + (kv_pairs & 0xffff) + mask = expert != 0xffff + gate_scal = tl.load(ExptScal + offs, mask=mask) + + # compute run lengths in expert-sorted order: + x = (kv_pairs & 0xffff0000 | 0x00000001) + expts_and_inclusive_run_lengths = tl.associative_scan(x, 0, _keyed_add) + exclusive_run_lengths = (expts_and_inclusive_run_lengths - 1) & 0xffff + + gates = tl.load(PartialOffs + pid_m * stride_pm + expert * stride_pn, mask=mask) + gates += tl.load(TokensStart + expert, mask=mask) + gates += exclusive_run_lengths + + tl.store(ScatterIndx + offs, gates, mask=mask) + tl.store(GatherIndx + gates, offs, mask=mask) + tl.store(GateScal + gates, gate_scal, mask=mask) + + +@triton.jit +def _combined_routing_compute(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, stride_pn, + TokensStart, n_tokens, BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr, Hist, + MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem, first_tile_dim_log2, + SIZES: tl.constexpr, BLOCK: tl.constexpr, blocks2a): + + pid = tl.program_id(0) + if pid < blocks2a: + _expt_data_compute(Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem, first_tile_dim_log2, + SIZES, BLOCK) + else: + pid -= blocks2a + _routing_compute_indx(pid, GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, + stride_pn, TokensStart, n_tokens, BLOCK_M, N_EXPTS_ACT) + + +@triton.jit +def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr): + pid_m = tl.program_id(0) + cutoff_word = cutoff // 32 + cutoff_bit = cutoff % 32 + cutoff_mask = (1 << (cutoff_bit)) - 1 + for start_n in range(0, shape_bn, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + values = tl.load(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, mask=offs_n < shape_bn) + values = tl.where(offs_n == cutoff_word, values & cutoff_mask, values) + values = tl.where(offs_n > cutoff_word, 0, values) + tl.store(Bitmatrix + pid_m * stride_bm + offs_n * stride_bn, values, mask=offs_n < shape_bn) + + +@triton.jit +def _combined_routing_memset(Indx, size, sentinel, BLOCK: tl.constexpr, ExpertHist, FinalExpertOffs, hist_size, + n_expts_tot, PartialHist, shape_pm, stride_pm, stride_pn, MDStarts, tile_starts_stridem, + blocks1a, MDTileInfo, first_tile_dim_log2, SIZES: tl.constexpr, BLOCK_A: tl.constexpr, + BLOCK_N: tl.constexpr, BLOCK_M: tl.constexpr): + """ + This kernel essentially combines 6 different pieces of functionality, + statically branching on the value of tl.program_id(0) to decide which + codepath to take. + + pid == 0: create the token cumsum + 1 <= pid <= SIZES: create a tile cumsum + SIZES < pid < blocks1a: initialise MDTileInfo to 0xffffffff + blocks1a <= pid < blocks1a + n_expts_tot: compute_indx_offs + pid == blocks1a + n_expts_tot: compute_expt_offs + pid > blocks1a + n_expts_tot: initialise Indx to sentinel + + As each of these is a relatively trivial workload, launching them from + this single trampoline is beneficial as they can execute on different + streaming multiprocesses in parallel. + """ + + pid = tl.program_id(0) + + if pid < blocks1a: + _expt_data_memset(ExpertHist, n_expts_tot, MDStarts, tile_starts_stridem, MDTileInfo, first_tile_dim_log2, + SIZES, BLOCK_A) + elif pid == n_expts_tot + blocks1a: + _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size, BLOCK_N) + elif pid < n_expts_tot + blocks1a: + _routing_compute_indx_offs(PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M, pid - blocks1a) + else: + offs = (pid - n_expts_tot - blocks1a - 1) * BLOCK + tl.arange(0, BLOCK) + mask = offs < size + tl.store(Indx + offs, sentinel, mask=mask) diff --git a/triton_kernels/specialize.py b/triton_kernels/specialize.py new file mode 100644 index 0000000000..9fcb857b29 --- /dev/null +++ b/triton_kernels/specialize.py @@ -0,0 +1,139 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/specialize.py +# Triton is licensed under the MIT License. + +import inspect +import re +import textwrap +import types +import triton + + +def cacheable(f): + """ + A decorator that allow you to write something of the form: + + @cacheable + def my_kernel(): return (expression dynamically defining a kernel) + + such that it interacts gracefully with triton cache and preload. + """ + + g = f() + g.fn.__name__ = f.__name__ + g.fn.__module__ = f.__module__ + g.fn.__qualname__ = f.__qualname__ + g.__name__ = f.__name__ + g.__module__ = f.__module__ + g.__qualname__ = f.__qualname__ + g._fn_name = f"{f.__module__}.{f.__qualname__}" + return g + + +def define_kernel(src, module, attrs=None, **extra_globals): + """ + Dynamically create a Triton function or kernel from a src string, + linking any symbols in the kernel to objects specified by extra_globals. + """ + + # create templace function + def _empty_fn(): + pass + + gdict = dict(**(_empty_fn.__globals__)) + gdict.update(extra_globals) + f = types.FunctionType(_empty_fn.__code__, gdict) + f.__module__ = module.__name__ + + src = textwrap.dedent(src) + src = src[src.find("def "):] + + stored_functions = [] + function_name = src[4:].split("(")[0].strip() + + exec_globals = gdict + exec_globals.update({"stored_functions": stored_functions}) + exec(src + "\n\nstored_functions.append(" + function_name + ")\n", exec_globals) + + f.__signature__ = inspect.signature(stored_functions[0]) + f.__name__ = function_name + f.__doc__ = stored_functions[0].__doc__ + + if attrs is None: + attrs = dict() + f = triton.JITFunction(f, **attrs) + f._unsafe_update_src(src) + return f + + +def specialize(fn, module, constants, tuples, name=None, do_not_specialize=tuple()): + assert isinstance(fn, triton.runtime.jit.JITFunction) + if name is None: + name = f"{fn.__name__}" + # Get original source code + src = inspect.getsource(fn.fn) + src = textwrap.dedent(src) + lines = src.split("\n") + # Skip decorator and def line + def_idx = next(i for i, line in enumerate(lines) if line.strip().startswith("def")) + # separate header vs body LOC + header_end = def_idx + while not lines[header_end].rstrip().endswith(":"): + header_end += 1 + body_lines = lines[header_end + 1:] + header_lines = lines[def_idx:header_end + 1] + # clean-up header + header_clean = [ + l.split("#", 1)[0].strip() # keep code, discard comment + for l in header_lines + if l.split("#", 1)[0].strip() # skip blank‑after‑comment lines + ] + # decompose arguments + header_src = " ".join(header_clean) # turn it into a single line + m = re.search(r"\((.*)\)\s*:", header_src) + if not m: + raise ValueError("Could not parse function header") + args_str = m.group(1) + args = [arg.strip() for arg in args_str.split(",") if arg.strip()] + non_specialized_args = [] + for arg in args: + arg_key = arg.split(":")[0].split("=")[0].strip() + new_args = tuples.get(arg_key, [arg]) + if arg_key not in constants: + non_specialized_args += new_args + # add global symbols + spec_fns = {v.__name__: v for k, v in constants.items() if isinstance(v, triton.runtime.jit.JITFunction)} + globals = spec_fns | fn.get_capture_scope() + # build new source code and define kernel dynamically + new_signature = f"def {name}({', '.join(non_specialized_args)}):" + constexpr_lines = [ + f" {key}: tl.constexpr = {value.__name__ if callable(value) else value}" for key, value in constants.items() + ] + tuple_lines = [ + f" {key} = {'(' + ','.join(value) + (',' if len(value)>=1 else '') + ')'}" for key, value in tuples.items() + ] + new_src = "\n".join(["@triton.jit", new_signature] + constexpr_lines + tuple_lines + body_lines) + # find function parameters + sig = inspect.signature(triton.runtime.jit.JITFunction.__init__) + params = list(sig.parameters.values())[2:] + attrs = {param.name: getattr(fn, param.name, param.default) for param in params} + + # make a new repr which appends the repr of the specialized functions. + base_repr = attrs["repr"] + + def new_repr(specialization): + ret = base_repr(specialization) + for spec_fn in spec_fns.values(): + spec_repr = spec_fn.repr(None) + if spec_repr: + spec_repr = spec_repr.strip("_") + if spec_repr: + ret += f"_{spec_repr}" + return ret + + attrs["repr"] = new_repr + + if do_not_specialize: + attrs["do_not_specialize"] = do_not_specialize + ret = define_kernel(new_src, module, attrs, **globals) + return ret diff --git a/triton_kernels/swiglu.py b/triton_kernels/swiglu.py new file mode 100644 index 0000000000..c1baee40f9 --- /dev/null +++ b/triton_kernels/swiglu.py @@ -0,0 +1,104 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/swiglu.py +# Triton is licensed under the MIT License. + +from dataclasses import dataclass +from triton_kernels.numerics import InFlexData, OutFlexData +import torch +import triton +from .swiglu_details._swiglu import _swiglu, _swiglu_fn +from triton_kernels import target_info + + +@dataclass(frozen=True) +class FlexCtx: + out_data: OutFlexData = OutFlexData() + inp_data: InFlexData = InFlexData() + saturate_inf: bool = False + + +@dataclass(frozen=True) +class PrecisionConfig: + limit: float + flex_ctx: FlexCtx = FlexCtx() + + +swiglu_fn = _swiglu_fn + + +class SwiGLU(torch.autograd.Function): + + @staticmethod + def forward(ctx, a, alpha, precision_config, routing_data): + N = a.shape[-1] + M = a.numel() // N + assert a.stride()[-1] == 1 + assert a.shape[-1] % 2 == 0 + out = torch.empty(size=(M, N // 2), dtype=a.dtype, device=a.device) + flex_ctx = precision_config.flex_ctx + # optimization hyperparameters + BLOCK_M, BLOCK_N = 32 // a.itemsize, 128 + num_warps = 4 + kwargs = {'maxnreg': 64} if not target_info.is_hip() else {} + # launch semi-persistent kernel + N_BLOCKS = triton.cdiv(N // 2, BLOCK_N) + num_sms = target_info.num_sms() + if routing_data is not None: + waves_per_sm = 32 if target_info.is_hip() else 128 + num_pid = num_sms * (waves_per_sm // num_warps) + M_BLOCKS = max(1, triton.cdiv(num_pid, N_BLOCKS)) + grid = (min(M_BLOCKS * N_BLOCKS, 4 * num_sms), ) + else: + M_BLOCKS = triton.cdiv(M, BLOCK_M) + if M_BLOCKS * N_BLOCKS >= 8 * num_sms: + grid = (8 * num_sms, ) + else: + grid = (min(M_BLOCKS * N_BLOCKS, 4 * num_sms), ) + n_tokens = None + if routing_data is not None: + n_tokens = routing_data.expt_data.token_offs_raw[routing_data.n_expts_tot] + _swiglu[grid]( + flex_ctx.out_data.reinterpret(out), + flex_ctx.out_data.expected_scale, + flex_ctx.out_data.actual_scale, + flex_ctx.out_data.checksum_scale, + flex_ctx.inp_data.reinterpret(a), + flex_ctx.inp_data.scale, + alpha, + M, + N // 2, + a.shape[-1], + 1, + out.shape[-1], + 1, + precision_config.limit, + n_tokens, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + EVEN_N=(N // 2) % BLOCK_N == 0, + M_BLOCKS=M_BLOCKS, + N_BLOCKS=N_BLOCKS, + flexpoint_saturate_inf=flex_ctx.saturate_inf, + num_warps=num_warps, + **kwargs, + ) + out = out.view(a.shape[:-1] + out.shape[-1:]) + return out + + +def swiglu(a, alpha, precision_config, routing_data=None): + return SwiGLU.apply(a, alpha, precision_config, routing_data) + + +def swiglu_torch(a, alpha, precision_config): + limit = precision_config.limit + a_gelu = a[..., ::2] + if limit is not None: + a_gelu = a_gelu.clamp(max=limit) + a_linear = a[..., 1::2] + if limit is not None: + a_linear = a_linear.clamp(min=-limit, max=limit) + + out_gelu = a_gelu * torch.sigmoid(alpha * a_gelu) + out = out_gelu * (a_linear + 1) + return out diff --git a/triton_kernels/swiglu_details/__init__.py b/triton_kernels/swiglu_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/swiglu_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/swiglu_details/_swiglu.py b/triton_kernels/swiglu_details/_swiglu.py new file mode 100644 index 0000000000..b2a907e074 --- /dev/null +++ b/triton_kernels/swiglu_details/_swiglu.py @@ -0,0 +1,106 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/swiglu_details/_swiglu.py +# Triton is licensed under the MIT License. + +from triton_kernels.numerics_details.flexpoint import load_scale, float_to_flex, update_scale +import triton +import triton.language as tl + + +@triton.jit +def clip(x, limit, clip_lower: tl.constexpr): + res = tl.minimum(x, limit) + if clip_lower: + res = tl.maximum(-limit, res) + return res + + +@triton.jit +def thread_local_absmax(x, BLOCK_SIZE: tl.constexpr, NUM_THREADS: tl.constexpr): + return tl.max(tl.reshape(tl.abs(x), [NUM_THREADS, BLOCK_SIZE // NUM_THREADS], can_reorder=True), axis=1) + + +def swiglu_repr(specialization): + signature = specialization.signature + constants = specialization.constants + convert_dtype = lambda dtype: "mxfp4" if "u8" in dtype else dtype + dtypes = "x".join([convert_dtype(f"{signature[i][1:]}") for i in ["Out", "A"]]) + blocks = "x".join([f"{constants[i]}" for i in ["BLOCK_M", "BLOCK_N"]]) + return f"_swiglu_{dtypes}_{blocks}" + + +def swiglu_launch_metadata(grid, kernel, args): + M, N = args["M"], args["N"] + ret = dict() + ret["name"] = f"{kernel.name} [M = {M}, N = {N}]" + A, Out = args["A"], args["Out"] + ret["bytes"] = Out.numel() * Out.element_size() + A.numel() * A.element_size() + return ret + + +@triton.jit +def compute_swiglu(gelu, linear, scale, alpha, limit): + gelu = gelu.to(tl.float32) * scale + if limit is not None: + gelu = clip(gelu, limit, clip_lower=False) + linear = linear.to(tl.float32) * scale + if limit is not None: + linear = clip(linear, limit, clip_lower=True) + s = gelu / (1 + tl.exp(-alpha * gelu)) + return tl.fma(s, linear, s) # (s * (linear + 1)) + + +@triton.jit(repr=lambda _: "_swiglu") +def _swiglu_fn(input, alpha, limit): + gelu, linear = tl.split(tl.reshape(input, (input.shape[0], input.shape[1] // 2, 2))) + return compute_swiglu(gelu, linear, 1.0, alpha, limit) + + +@triton.jit(repr=swiglu_repr, launch_metadata=swiglu_launch_metadata) +def _swiglu(Out, OutExpectedScale, OutActualScale, OutChecksumScale, A, AScale, alpha, M, N, stride_am, stride_an, + stride_outm, stride_outn, limit: tl.constexpr, NTokens, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, + EVEN_N: tl.constexpr, M_BLOCKS, N_BLOCKS, flexpoint_saturate_inf: tl.constexpr): + if NTokens is not None: + M = tl.load(NTokens) + M_BLOCKS = (M + BLOCK_M - 1) // BLOCK_M + + local_max = tl.full([tl.extra.cuda.num_threads()], 0.0, tl.float32) + + a_scale = load_scale(AScale) + out_expected_scale = load_scale(OutExpectedScale) + + for pid in tl.range(tl.program_id(0), M_BLOCKS * N_BLOCKS, tl.num_programs(0), num_stages=2): + pid_m = (pid // N_BLOCKS) + pid_n = (pid % N_BLOCKS) + off_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + mask_m = off_m < M + mask_n = off_n < N + packed_off_n = pid_n * BLOCK_N + tl.arange(0, 2 * BLOCK_N) // 2 + packed_mask_n = packed_off_n < N + packed_mask_n = tl.max_constancy(packed_mask_n, [16]) + # load a + packed_off_n = pid_n * 2 * BLOCK_N + tl.arange(0, 2 * BLOCK_N) + packed_offs = off_m[:, None] * stride_am + packed_off_n[None, :] * stride_an + if EVEN_N: + a_packed = tl.load(A + packed_offs, mask=mask_m[:, None], other=0.) + else: + if pid_n * BLOCK_N + BLOCK_N <= N: + a_packed = tl.load(A + packed_offs, mask=mask_m[:, None], other=0.) + else: + packed_mask = mask_m[:, None] & packed_mask_n[None, :] + a_packed = tl.load(A + packed_offs, mask=packed_mask, other=0.) + a_gelu, a_linear = tl.split(tl.reshape(a_packed, (BLOCK_M, BLOCK_N, 2))) + out = compute_swiglu(a_gelu, a_linear, a_scale, alpha, limit) + # update flexpoint stats and divide by scale + # we don't need masking because of the `other` when loading `A` + if OutActualScale is not None: + absmax = thread_local_absmax(out, out.numel, tl.extra.cuda.num_threads()) + local_max = tl.maximum(local_max, absmax) + out = float_to_flex(out, out_expected_scale, + None, # ActualScale: local absmax is tracked and updated after the loop + OutChecksumScale, None, Out, flexpoint_saturate_inf) + mask = mask_m[:, None] if EVEN_N else mask_m[:, None] & mask_n[None, :] + tl.store(Out + off_m[:, None] * stride_outm + off_n[None, :] * stride_outn, out, mask) + + update_scale(local_max, OutActualScale, Out) diff --git a/triton_kernels/target_info.py b/triton_kernels/target_info.py new file mode 100644 index 0000000000..6966f63d62 --- /dev/null +++ b/triton_kernels/target_info.py @@ -0,0 +1,58 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/target_info.py +# Triton is licensed under the MIT License. + +import torch +import triton +import triton.language as tl + +from triton.language.target_info import ( + cuda_capability_geq, + is_cuda, + is_hip, + is_hip_cdna3, + is_hip_cdna4, +) + +__all__ = [ + "cuda_capability_geq", + "get_cdna_version", + "has_tma_gather", + "has_native_mxfp", + "is_cuda", + "is_hip", + "is_hip_cdna3", + "is_hip_cdna4", + "num_sms", +] + + +@triton.constexpr_function +def get_cdna_version(): + """ + Gets the AMD architecture version, i.e. CDNA3 or CDNA4, currently + only supports 3 (gfx942) or 4 (gfx950). Returns -1 if it is not AMD + hardware or unsupported architecture + """ + target = tl.target_info.current_target() + if target.backend != 'hip': + return -1 + if target.arch == 'gfx942': + return 3 + if target.arch == 'gfx950': + return 4 + return -1 + + +@triton.constexpr_function +def has_tma_gather(): + return cuda_capability_geq(10, 0) + + +@triton.constexpr_function +def has_native_mxfp(): + return cuda_capability_geq(10, 0) + + +def num_sms(): + return torch.cuda.get_device_properties(0).multi_processor_count diff --git a/triton_kernels/tensor.py b/triton_kernels/tensor.py new file mode 100644 index 0000000000..ae68a89896 --- /dev/null +++ b/triton_kernels/tensor.py @@ -0,0 +1,219 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor.py +# Triton is licensed under the MIT License. + +from dataclasses import dataclass, fields +from typing import Type + +import torch +from triton.tools.tensor_descriptor import TensorDescriptor +from triton.tools.ragged_tma import create_ragged_descriptor + +from .reduction_details.reduce_bitmatrix import clear_sums, sum_bitmatrix_rows +from .target_info import cuda_capability_geq +from .tensor_details.layout import Layout, StridedLayout + + +@dataclass +class Storage: + data: torch.Tensor + layout: Layout = None + + def __post_init__(self): + assert isinstance(self.data, torch.Tensor) + if self.layout is None: + self.layout = StridedLayout(self.data.shape) + + @property + def device(self): + return self.data.device + + def is_tma_compliant(self): + # TMAs didn't exist until Hopper + if not cuda_capability_geq(9, 0): + return False + # TMAs only exist for 2D, 3D, 5D inputs + if len(self.data.shape) not in [2, 3, 5]: + return False + # TMAs need at most one stride equal to 1 + # and all other strides divisble by 16 + strides = list(self.data.stride()) + try: + major_dim = strides.index(1) + except ValueError: + major_dim = -1 + ndim = self.data.ndim + bitwidth = 4 if self.data.dtype == torch.uint8 else self.data.element_size() * 8 + compliant = [strides[i] * bitwidth % 128 == 0 for i in range(ndim) if i != major_dim] + return all(compliant) + + def make_dense_tma(self, block_shape, transpose=False): + strides = list(self.data.stride()) + shape = list(self.data.shape) + transpose = self.data.stride()[-1] != 1 + if transpose: + block_shape = block_shape[:-2] + [block_shape[-1], block_shape[-2]] + shape = shape[:-2] + [shape[-1], shape[-2]] + strides = strides[:-2] + [strides[-1], strides[-2]] + if self.data.dtype == torch.uint8 and self.layout.name == "BLACKWELL_VALUE": + indx = strides.index(1) + block_shape[indx] = block_shape[indx] // 2 + if shape[-1] % 128 != 0: + raise ValueError("inner shape need to be multiple of 128 for " + "mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs.") + block_shape = self.layout.swizzle_block_shape(block_shape) + return TensorDescriptor(self.data, shape, strides, block_shape) + + def make_tma(self, block_shape, mode, transpose=False): + if mode in ["dense", "gather", "scatter"]: + return self.make_dense_tma(block_shape, transpose) + assert mode == "ragged" + ragged_dim = len(self.data.shape) - 2 + return create_ragged_descriptor(self.data, block_shape, ragged_dim=ragged_dim) + + +@dataclass +class IntegerType: + bitwidth: int + + +@dataclass +class FloatType: + bitwidth_exponent: int + bitwidth_mantissa: int + is_signed: bool + + def __post_init__(self): + self.bitwidth = int(self.is_signed) + self.bitwidth_exponent + self.bitwidth_mantissa + + +BIT = IntegerType(1) +FP4 = FloatType(bitwidth_exponent=2, bitwidth_mantissa=1, is_signed=True) + + +def bitwidth(type: IntegerType | FloatType | torch.dtype): + if isinstance(type, torch.dtype): + return type.itemsize * 8 + return type.bitwidth + + +@dataclass +class Tensor: + storage: Storage | torch.Tensor + dtype: IntegerType | FloatType | torch.dtype = None + shape: list[int] | None = None + shape_max: list[int] | None = None + + def __post_init__(self): + # set storage + if isinstance(self.storage, torch.Tensor): + self.storage = Storage(self.storage) + # initialize dtype + if self.dtype is None: + self.dtype = self.storage.data.dtype + if bitwidth(self.dtype) < 8 and self.shape is None: + raise ValueError("shape must be provided for sub-byte types") + # initialize shape + if self.shape is None: + self.shape = list(self.storage.data.shape) + # validate shape: all elements must be `int` or numel-1 `torch.Tensor` + is_int = lambda s: isinstance(s, int) + is_item = lambda s: hasattr(s, "numel") and s.numel() == 1 + assert all(map(lambda s: is_int(s) or is_item(s), self.shape)) + # initialize shape_max + if self.shape_max is None: + self.shape_max = [None] * len(self.shape) + for i, (s, smax) in enumerate(zip(self.shape, self.shape_max)): + if smax is not None and not is_int(smax): + raise ValueError(f"shape_max[{i}] must be `int` or `None`; got {type(smax)}") + if smax is None: + self.shape_max[i] = s + # validate shape_max: all elements must be `int` + assert all(map(is_int, self.shape_max)) + + # torch compatibility layer + @property + def ndim(self): + return len(self.shape) + + @property + def device(self): + return self.storage.device + + def stride(self, i=None): + return self.storage.data.stride() if i is None else self.storage.data.stride(i) + + def data_ptr(self): + return self.storage.data.data_ptr() + + def numel(self): + return self.storage.data.numel() + + def element_size(self): + return bitwidth(self.dtype) // 8 + + @property + def data(self): + t = self.storage + return t.data if isinstance(t, Storage) else t + + def dim(self): + return self.ndim + + def size(self, i=None): + if i is None: + return self.shape + return self.shape[i] + + +@dataclass +class Bitmatrix(Tensor): + """ + Represents a boolean matrix in a packed format where each element occupies + a single bit of memory. + + _scratchpad is either None or an all-zero array of size >= shape[-1]; we pass it along + with the actual bitmatrix to avoid having to launch a separate memset + kernel when we call Bitmatrix::sum(). + """ + + scratchpad: torch.Tensor = None + + def __init__(self, storage, shape, shape_max=None, scratchpad=None): + super().__init__(storage, dtype=BIT, shape=shape, shape_max=shape_max) + self.scratchpad = scratchpad + + def sum(self, partials_block_size): + _, n_cols = self.shape + dev = self.device + if self.scratchpad is None: + self.scratchpad = clear_sums(n_cols, dev) + out_ret = self.scratchpad[:n_cols] + self.scratchpad = None # throw error if we try to sum again + return sum_bitmatrix_rows(self, out_ret, partials_block_size) + + +def get_layout(tensor: torch.Tensor | Tensor | None): + if tensor is None: + return None + if isinstance(tensor, Tensor): + return tensor.storage.layout + return StridedLayout + + +def wrap_torch_tensor(torch_tensor, dtype=None): + if dtype is None: + dtype = torch_tensor.dtype + shape = list(torch_tensor.shape) + shape[torch_tensor.stride().index(1)] *= bitwidth(torch_tensor.dtype) // bitwidth(dtype) + return Tensor(Storage(torch_tensor), dtype=dtype, shape=shape) + + +def convert_layout(tensor: Tensor, layout_cls: Type[Layout], **layout_kwargs): + assert isinstance(tensor, Tensor) + old_storage = tensor.storage + old_data = old_storage.layout.unswizzle_data(old_storage.data) + new_layout = layout_cls(old_data.shape, **layout_kwargs) + new_data = new_layout.swizzle_data(old_data) + attrs = {k.name: getattr(tensor, k.name) for k in fields(tensor) if k.name != "storage"} + return Tensor(Storage(new_data, new_layout), **attrs) diff --git a/triton_kernels/tensor_details/__init__.py b/triton_kernels/tensor_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/tensor_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/tensor_details/layout.py b/triton_kernels/tensor_details/layout.py new file mode 100644 index 0000000000..9670c096ad --- /dev/null +++ b/triton_kernels/tensor_details/layout.py @@ -0,0 +1,44 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout.py +# Triton is licensed under the MIT License. + +from .layout_details.base import Layout +from .layout_details.blackwell_scale import BlackwellMXScaleLayout +from .layout_details.blackwell_value import BlackwellMXValueLayout +from .layout_details.hopper_scale import HopperMXScaleLayout +from .layout_details.hopper_value import HopperMXValueLayout +from .layout_details.cdna4_scale import CDNA4MXScaleLayout +from .layout_details.strided import StridedLayout +from ..target_info import cuda_capability_geq, is_hip_cdna4 + +__all__ = [ + "Layout", + "BlackwellMXValueLayout", + "BlackwellMXScaleLayout", + "HopperMXScaleLayout", + "HopperMXValueLayout", + "CDNA4MXScaleLayout", + "StridedLayout", +] + + +def make_default_matmul_mxfp4_w_layout(mx_axis: int): + if cuda_capability_geq(10): + # return StridedLayout, dict() + return BlackwellMXValueLayout, dict() + elif cuda_capability_geq(9): + return HopperMXValueLayout, {"mx_axis": mx_axis} + else: + return StridedLayout, dict() + + +def make_default_matmul_mxfp4_w_scale_layout(mx_axis: int, num_warps: int = 8): + if is_hip_cdna4(): + return CDNA4MXScaleLayout, dict() + else: + if cuda_capability_geq(10): + return BlackwellMXScaleLayout, dict() + elif cuda_capability_geq(9): + return HopperMXScaleLayout, {"mx_axis": mx_axis, "num_warps": num_warps} + + return StridedLayout, dict() diff --git a/triton_kernels/tensor_details/layout_details/__init__.py b/triton_kernels/tensor_details/layout_details/__init__.py new file mode 100644 index 0000000000..2d460d330a --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/__init__.py @@ -0,0 +1 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. diff --git a/triton_kernels/tensor_details/layout_details/base.py b/triton_kernels/tensor_details/layout_details/base.py new file mode 100644 index 0000000000..3d725ac923 --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/base.py @@ -0,0 +1,23 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/base.py +# Triton is licensed under the MIT License. + +from abc import ABC, abstractmethod + + +class Layout(ABC): + + def __init__(self, shape) -> None: + self.initial_shape = shape + + @abstractmethod + def swizzle_data(self, data): + pass + + @abstractmethod + def unswizzle_data(self, data): + pass + + @abstractmethod + def swizzle_block_shape(self, block_shape): + pass diff --git a/triton_kernels/tensor_details/layout_details/blackwell_scale.py b/triton_kernels/tensor_details/layout_details/blackwell_scale.py new file mode 100644 index 0000000000..3ff602577a --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/blackwell_scale.py @@ -0,0 +1,62 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/blackwell_scale.py +# Triton is licensed under the MIT License. + +import math +import triton +import triton.language as tl +import torch +from .base import Layout + +SWIZZLE_ALIGN_INNER = 8 +SWIZZLE_SIZE_INNER = 4 +SWIZZLE_SIZE_OUTER = 128 + + +class BlackwellMXScaleLayout(Layout): + name: str = "BLACKWELL_SCALE" + + def __init__(self, shape) -> None: + super().__init__(shape) + *self.leading_shape, self.K, self.N, = shape + self.B = math.prod(self.leading_shape) + self.ALIGN_K = 8 + self.ALIGN_N = 128 + self.SWIZZLE_K = 4 + self.K_pad = (self.K + self.ALIGN_K - 1) // self.ALIGN_K * self.ALIGN_K + self.N_pad = (self.N + self.ALIGN_N - 1) // self.ALIGN_N * self.ALIGN_N + + def swizzle_data(self, data): + data = torch.nn.functional.pad(data, (0, self.N_pad - self.N, 0, self.K_pad - self.K)) + data = data.transpose(-1, -2).contiguous() + data = data.reshape(self.B, self.N_pad // self.ALIGN_N, self.ALIGN_N // 32, 32, self.K_pad // self.SWIZZLE_K, + self.SWIZZLE_K) + data = data.transpose(2, 4).contiguous() + data = data.view(1, self.B * self.N_pad // 128, self.K_pad // 4, 2, 256) + return data + + def unswizzle_data(self, data): + data = data.reshape(self.B, self.N_pad // self.ALIGN_N, self.K_pad // self.SWIZZLE_K, 32, self.ALIGN_N // 32, + self.SWIZZLE_K) + data = data.transpose(2, 4) + data = data.reshape(*self.leading_shape, self.N_pad, self.K_pad) + data = data.transpose(-1, -2) + return data[..., :self.K, :self.N] + + def swizzle_block_shape(self, block_shape): + MX_PACK_DIVISOR = 32 + MX_SCALE_BLOCK_K = block_shape[1] // MX_PACK_DIVISOR + return [1, block_shape[0] // 128, MX_SCALE_BLOCK_K // 4, 2, 256] + + +@triton.jit +def unswizzle_mx_scale_bw(x, SIZE_OUTER: tl.constexpr = SWIZZLE_SIZE_OUTER, + SIZE_INNER: tl.constexpr = SWIZZLE_SIZE_INNER, + ALIGN_INNER: tl.constexpr = SWIZZLE_ALIGN_INNER): + shape_0: tl.constexpr = x.shape[0] + shape_1: tl.constexpr = x.shape[1] + tl.static_assert(shape_1 % SIZE_OUTER == 0) + tl.static_assert(shape_1 // SIZE_OUTER <= ALIGN_INNER) + x = x.reshape(shape_0, (shape_1 // SIZE_OUTER) // SIZE_INNER, 32, SIZE_OUTER // 32, SIZE_INNER) + x = x.trans(0, 3, 2, 1, 4).reshape(shape_0 * SIZE_OUTER, shape_1 // SIZE_OUTER) + return x diff --git a/triton_kernels/tensor_details/layout_details/blackwell_value.py b/triton_kernels/tensor_details/layout_details/blackwell_value.py new file mode 100644 index 0000000000..15a0e4783b --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/blackwell_value.py @@ -0,0 +1,37 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/blackwell_value.py +# Triton is licensed under the MIT License. + +import torch +from .base import Layout + + +class BlackwellMXValueLayout(Layout): + name: str = "BLACKWELL_VALUE" + + def __init__(self, shape) -> None: + super().__init__(shape) + self.shape = shape + + def swizzle_data(self, data): + # permutation needed to make `data` row major + to_row_major = sorted(range(data.ndim), key=lambda d: (data.stride(d), d))[::-1] + # permutation needed to retrieve original order + inv = [0] * data.ndim + for i, d in enumerate(to_row_major): + inv[d] = i + # leading dimension must be padded to be aligned to 128 + align_dim = lambda x: (x + 128 - 1) // 128 * 128 + major_dim = data.stride().index(1) + pad = align_dim(data.shape[major_dim]) - data.shape[major_dim] + data = torch.nn.functional.pad(data.permute(to_row_major), (0, pad)).permute(inv) + return data + + def unswizzle_data(self, data: torch.Tensor): + # Trim padding along all dims back to the original shape recorded at init. + assert data.ndim == len(self.shape), "Rank mismatch between data and recorded shape" + sizes = [min(data.size(i), self.shape[i]) for i in range(data.ndim)] + return data[tuple(slice(0, s) for s in sizes)] + + def swizzle_block_shape(self, block_shape): + return block_shape diff --git a/triton_kernels/tensor_details/layout_details/cdna4_scale.py b/triton_kernels/tensor_details/layout_details/cdna4_scale.py new file mode 100644 index 0000000000..87dbd6b377 --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/cdna4_scale.py @@ -0,0 +1,48 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/cdna4_scale.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl +from .base import Layout + +NON_K_PRESHUFFLE_BLOCK_SIZE = 32 + + +class CDNA4MXScaleLayout(Layout): + name: str = "CDNA4_SCALE" + + def __init__(self, shape) -> None: + super().__init__(shape) + + def swizzle_data(self, data): + block_shape = data.shape + SCALE_K = block_shape[-2] + N = block_shape[-1] + data = data.transpose(-1, -2) + data = data.view(-1, N // NON_K_PRESHUFFLE_BLOCK_SIZE, 2, 16, SCALE_K // 8, 2, 4, 1) + data = data.permute(0, 1, 4, 6, 3, 5, 2, 7).contiguous() + if len(block_shape) == 3: + E = block_shape[0] + data = data.reshape(E, N // 32, SCALE_K * 32) + else: + assert len(block_shape) == 2 + data = data.reshape(N // 32, SCALE_K * 32) + return data.transpose(-1, -2) + + def unswizzle_data(self, data): + raise NotImplementedError() + + def swizzle_block_shape(self, block_shape): + SCALE_K = block_shape[-2] + N = block_shape[-1] + return block_shape[:-2] + [N // 32, SCALE_K * 32] + + +@triton.jit +def unswizzle_mx_scale_cdna4(x, BLOCK_N: tl.constexpr, MX_SCALE_BLOCK_K: tl.constexpr, + N_PRESHUFFLE_FACTOR: tl.constexpr = NON_K_PRESHUFFLE_BLOCK_SIZE): + x = x.reshape(BLOCK_N // N_PRESHUFFLE_FACTOR, MX_SCALE_BLOCK_K // 8, 4, 16, 2, 2, 1) + x = x.permute(0, 5, 3, 1, 4, 2, 6) + x = x.reshape(BLOCK_N, MX_SCALE_BLOCK_K) + return x diff --git a/triton_kernels/tensor_details/layout_details/hopper_scale.py b/triton_kernels/tensor_details/layout_details/hopper_scale.py new file mode 100644 index 0000000000..feb342fc88 --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/hopper_scale.py @@ -0,0 +1,84 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_scale.py +# Triton is licensed under the MIT License. + +import torch +import triton +import triton.language as tl +from .base import Layout + + +class HopperMXScaleLayout(Layout): + name: str = "HOPPER_SCALE" + + def __init__(self, shape, mx_axis, num_warps=8) -> None: + assert num_warps & (num_warps - 1) == 0, "warps_n must be a power of 2" + super().__init__(shape) + self.mx_axis = mx_axis + self.num_warps = num_warps + *self.leading_shape, _, _ = shape + + def _maybe_mT(self, data): + if self.mx_axis == len(self.leading_shape): + return data.contiguous().mT + return data + + def swizzle_data(self, data): + data = self._maybe_mT(data).contiguous() + *batch, M, K = data.shape + SWIZZLE_ALIGN_M = 2 * self.num_warps * 2 * 8 + SWIZZLE_ALIGN_K = 2 + pad_m = (SWIZZLE_ALIGN_M - (M % SWIZZLE_ALIGN_M)) % SWIZZLE_ALIGN_M + pad_k = (SWIZZLE_ALIGN_K - (K % SWIZZLE_ALIGN_K)) % SWIZZLE_ALIGN_K + data = torch.nn.functional.pad(data, (0, pad_k, 0, pad_m)) + *batch, M, K = data.shape + assert data.is_contiguous() + assert M % ( + 2 * self.num_warps * 2 * + 8) == 0 and K % 2 == 0, f"Input tensor must have a subtile of shape (..., {2 * self.num_warps * 2 * 8}, 2)" + b = len(batch) + data = data.reshape(*batch, M // (2 * self.num_warps * 2 * 8), 2, self.num_warps, 2, 8, K // 2, 2) + perm = [0, 2, 5, 1, 4, 6, 3] + perm = list(range(b)) + [b + p for p in perm] + data = data.permute(*perm) + data = data.flatten(-5, -1) + data = data.flatten(-3, -2) + assert data.shape[-2] == M // 32 + assert data.shape[-1] == K * 32 + data = self._maybe_mT(data) + return data + + def unswizzle_data(self, data): + data = self._maybe_mT(data) + *batch, M, K = data.shape + b = len(batch) + data = data.reshape(*batch, M // self.num_warps, self.num_warps, K // 64, 2, 8, 2, 2) + perm = [0, 3, 1, 6, 4, 2, 5] + perm = list(range(b)) + [b + p for p in perm] + data = data.permute(*perm) + data = data.reshape(*batch, M * 32, K // 32) + data = self._maybe_mT(data) + return data + + def swizzle_block_shape(self, block_shape): + return block_shape + + +@triton.jit +def unswizzle_mxfp4_scale_hopper(x, mx_axis: tl.constexpr, num_warps: tl.constexpr): + """ + Triton inverse of swizzle_mxfp4_scale_hopper + """ + tl.static_assert(len(x.shape) == 2, "NYI") + # implementation assumes mxfp data is packed along the last dimension + x = x.trans() if mx_axis == 0 else x + M: tl.constexpr = x.shape[0] + K: tl.constexpr = x.shape[1] + tl.static_assert(M % num_warps == 0, f"M must be divisible by {num_warps}. Got {M}") + tl.static_assert(K % 64 == 0, f"K must be divisible by 64. Got {K}") + x = x.reshape(M // num_warps, num_warps, K // 64, 2, 8, 2, 2) + x = x.trans(0, 3, 1, 6, 4, 2, 5) + x = x.reshape(M * 32, K // 32) + # implementation assumed mxfp data is packed along the last dimension + x = x.trans() if mx_axis == 0 else x + return x diff --git a/triton_kernels/tensor_details/layout_details/hopper_value.py b/triton_kernels/tensor_details/layout_details/hopper_value.py new file mode 100644 index 0000000000..bf38b71b51 --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/hopper_value.py @@ -0,0 +1,327 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/hopper_value.py +# Triton is licensed under the MIT License. + +import torch +import triton +import triton.language as tl +from .base import Layout + + +def right_shift_unsigned(x, shift): + return (x >> shift) & ((1 << (32 - shift)) - 1) + + +# ----------------------------------------------------------------------- +# Interleave the bits of four consecutive fp4 values (i.e. 16-bits) as: +# 1000000111000000 (first fp4) +# 1000000111000000 (second fp4) +# 1000000111000000 (third fp4) +# 0110110000000000 (fourth fp4) +# This is done so that dequantization can be done in 14 SASS instructions +# ----------------------------------------------------------------------- + + +def _compress_fp4(x): + x = x.to(torch.int32) + return ((x & 0x8) << 12) | ((x & 0x7) << 6) + + +def _compress_fourth(x): + x = x.to(torch.int32) + return ((x & 0x8) << 11) | ((x & 0x6) << 9) | ((x & 0x1) << 13) + + +def _pack_bits(x: torch.Tensor, mx_axis: int): + x = x.contiguous() + assert x.shape[-1] % 4 == 0, "Input tensor must have a last dimension divisible by 4" + x = x.reshape(x.shape[:-1] + (x.shape[-1] // 4, 4)) + first = _compress_fp4(x[..., 0]) | (_compress_fp4(x[..., 0] >> 4) << 16) + second = _compress_fp4(x[..., 1]) | (_compress_fp4(x[..., 1] >> 4) << 16) + third = _compress_fp4(x[..., 2]) | (_compress_fp4(x[..., 2] >> 4) << 16) + fourth = _compress_fourth(x[..., 3]) | (_compress_fourth(x[..., 3] >> 4) << 16) + x = first | right_shift_unsigned(second, 3) | right_shift_unsigned(third, 6) | fourth + assert x.is_contiguous() + x = x.view(torch.uint8) + return x + + +# ----------------------------------------------------------------------- +# inverse operation of _pack_bits +# ----------------------------------------------------------------------- + + +def _bf16_to_fp4e2m1(x): + # 0bAxxxxxxBCDxxxxxx (int16) -> 0b0000ABCD (uint8) + assert x.dtype == torch.int16 + s = (right_shift_unsigned(x, 15) & 0x1) << 3 + em = right_shift_unsigned(x, 6) & 0x7 + return (s | em).to(torch.uint8) + + +def _bf16x2_to_fp4e2m1x2(x): + # 0bAxxxxxxBCDxxxxxx_0bExxxxxxFGHxxxxxx (int32) -> 0bABCD_EFGH (uint8) + assert x.dtype == torch.int32 + lo = (x & 0xFFFF).to(torch.int16) + hi = (right_shift_unsigned(x, 16) & 0xFFFF).to(torch.int16) + ret_lo = _bf16_to_fp4e2m1(lo) + ret_hi = _bf16_to_fp4e2m1(hi) + return ret_lo | (ret_hi << 4) + + +def _unpack_bits(x, mx_axis: int): + x = x.view(torch.int32) + m = 0b10000001110000001000000111000000 + a = (x << 1) & 0b10000000000000001000000000000000 + b = right_shift_unsigned(x, 3) & 0b00000001100000000000000110000000 + c = right_shift_unsigned(x, 7) & 0b00000000010000000000000001000000 + unpacked = [x & m, (x << 3) & m, (x << 6) & m, (a | b) | c] + x = torch.stack(unpacked, dim=-1) + x = x.flatten(-2, -1) + x = _bf16x2_to_fp4e2m1x2(x) + return x + + +# ----------------------------------------------------------------------- + + +class HopperMXValueLayout(Layout): + name: str = "HOPPER_VALUE" + + def __init__(self, shape, mx_axis, mma_version=3): + super().__init__(shape) + assert mx_axis in range(len(shape)) + self.mx_axis = mx_axis + self.mma_version = mma_version + *self.leading_shape, self.K, self.N, = shape + + def _maybe_mT(self, data): + if self.mx_axis == len(self.leading_shape): + return data.mT + return data + + def swizzle_data(self, data): + """ + Given a uint8 tensor of shape (*, M, K), returns a tensor of shape + (*, M // 4, K * 4) such that: + + 1) Groups contiguously all the elements owned by the same thread of 4 + mma tiles along the K axis. The following animation shows a similar + grouping for 2 tiles along M and 2 tiles along K rather than 4 along K + as done here: + https://neuralmagic.com/wp-content/uploads/2024/10/animation_4.gif + + 2) Moves the elements belonging to thread 4-7 to be contiguous with those + from thread 0-3. This is done to get a full cache line when loading them + from HBM. + + mx_axis selects the lhs or rhs of the matmul. + + WARNING: Assumes that the matmul will be done in bf16 or fp16! + Implementing it for fp8 is as easy as making the tile size (8, 8) + """ + batch = data.ndim - 2 + assert batch >= 0 + assert self.mma_version in (2, 3) + data = self._maybe_mT(data) + init_shape = data.shape + + # We are loading 8 bf16 elements per thread to use ld.global.v4 + # Every u8 represents 2 mxfp4 elements + u8_kwidth = 8 // 2 if self.mma_version == 2 else 1 + + # Pack the 4 // u8_kwidth subtiles of an mma into a u4x8 + contig = (1, u8_kwidth) + scott_trick = (2, 1) + threads = (4, 4) + warp_tile = (2, 2) + k_tile = (1, 4 // u8_kwidth) + + sizes = list(data.shape[:-2]) + pads = [] + # [rest, K, tile, threads] per dimension + for i, (a, b, c, s, d) in enumerate(zip(k_tile, warp_tile, threads, scott_trick, contig)): + pack = a * b * c * s * d + size = data.shape[batch + i] + pad = (pack - size % pack) % pack + pads += [(0, pad)] + sizes.append((size + pad) // pack) + sizes += [a, b, c, s, d] + + pads = tuple(x for t in pads[::-1] for x in t) + data = torch.nn.functional.pad(data, pads) + init_shape = data.shape + # 0: rest[0] + # 1: k_tile[0] + # 2: warp_tile[0] + # 3: threads[0] + # 4: scott_trick[0] + # 5: contig[0] + # 6: rest[1] + # 7: k_tile[1] + # 8: warp_tile[1] + # 9: threads[1] + # 10: scott_trick[1] + # 11: contig[1] + data = data.view(*sizes) + # Want [rest[0], threads[0], rest[1], scott_trick[0], scott_trick[0], threads[1], contig[1], contig[0], k_tile[1], k_tile[0], warp_tile[1], warp_tile[0]] + perm = [0, 3, 6, 10, 4, 9, 7, 1, 8, 2, 5, 11] + perm = list(range(batch)) + [batch + p for p in perm] + data = data.permute(*perm).contiguous() + # These are views + data = data.flatten(-10, -1) + data = data.flatten(-3, -2) + assert data.is_contiguous() + assert data.shape[-2] == init_shape[-2] // 4 + assert data.shape[-1] == init_shape[-1] * 4 + # twiddle the bits + data = _pack_bits(data, self.mx_axis) + data = self._maybe_mT(data) + return data + + def unswizzle_data(self, data): + data = self._maybe_mT(data) + data = _unpack_bits(data, self.mx_axis) + *batch, M, K = data.shape + # We have two times the elements if we already upcasted to bfloat16 + mult = 2 if data.dtype == torch.bfloat16 else 1 + assert M % 4 == 0, "M must be divisible by 4" + assert K % (4 * 8 * 2 * 2 * mult) == 0, f"K must be divisible by {4 * 8 * 2 * 2 * mult}" + # We are loading 8 bf16 elements per thread to use ld.global.v4 + # Every u8 represents 2 mxfp4 elements + u8_kwidth = 8 // 2 if self.mma_version == 2 else 1 + data = data.reshape(*batch, M // 4, 4, K // (4 * 8 * 2 * 2 * mult), 2, 4, 8 // u8_kwidth, 2, u8_kwidth * mult) + b = len(batch) + perm = [0, 6, 1, 3, 2, 5, 4, 7] + perm = list(range(b)) + [b + p for p in perm] + data = data.permute(*perm) + data = data.reshape(*batch, M * 4, K // 4) + data = self._maybe_mT(data) + return data[..., :self.K, :self.N] + + def swizzle_block_shape(self, block_shape): + return block_shape + + +@triton.jit +def _unshuffle_triton(x, mma_version: tl.constexpr): + """ + Triton inverse of swizzle_mxfp4_value_hopper + """ + tl.static_assert(mma_version == 2 or mma_version == 3, "mma_version must be 2 or 3") + # if mx_axis == 0: + # x = x.trans() + + # We have two times the elements if we already upcasted to bfloat16 + mult: tl.constexpr = 2 if x.dtype == tl.bfloat16 else 1 + M: tl.constexpr = x.shape[0] + K: tl.constexpr = x.shape[1] + tl.static_assert(M % 4 == 0, "M must be divisible by 4") + tl.static_assert(K % (4 * 8 * 2 * 2 * mult) == 0, f"K must be divisible by {4 * 8 * 2 * 2 * mult}") + + # We are loading 8 bf16 elements per thread to use ld.global.v4 + # Every u8 represents 2 mxfp4 elements + u8_kwidth: tl.constexpr = 8 // 2 if mma_version == 2 else 1 + x = x.reshape(M // 4, 4, K // (4 * 8 * 2 * 2 * mult), 2, 4, 8 // u8_kwidth, 2, u8_kwidth * mult) + x = x.trans(0, 6, 1, 3, 2, 5, 4, 7) + x = x.reshape(M * 4, K // 4) + # if mx_axis == 0: + # x = x.trans() + return x + + +@triton.jit +def _unpack_fp4_to_bf16_triton(x): + # For now we implement just H100 support (mul.bf16x2) + # A100 support is possible via fma + r0, r1 = tl.inline_asm_elementwise( + r""" + { + .reg .b32 b, c, d<7>, scale; + .reg .b32 bias; + mov.b32 bias, 0x7e807e80; // 2 ** 126 == 2 ** (bias_bf16 - bias_fp2) + // We add the missing bias to the scale directly + and.b32 $0, $4, 0b10000001110000001000000111000000; + mul.bf16x2 $0, $0, bias; + shl.b32 b, $4, 3; + and.b32 $1, b, 0b10000001110000001000000111000000; + mul.bf16x2 $1, $1, bias; + shl.b32 c, $4, 6; + and.b32 $2, c, 0b10000001110000001000000111000000; + mul.bf16x2 $2, $2, bias; + // Unpack last two elements + shl.b32 d0, $4, 1; + and.b32 d1, d0, 0b10000000000000001000000000000000; + shr.b32 d2, $4, 3; + and.b32 d3, d2, 0b00000001100000000000000110000000; + or.b32 d4, d1, d3; + shr.b32 d5, $4, 7; + and.b32 d6, d5, 0b00000000010000000000000001000000; + or.b32 $3, d4, d6; + mul.bf16x2 $3, $3, bias; + } + """, + constraints="=r,=r,=r,=r,r", + args=[x], + dtype=(tl.bfloat16, tl.bfloat16), + is_pure=True, + pack=4, + ) + # Concat each pack of 4 + x = tl.join(r0, r1) + x = x.reshape(x.shape[0], x.shape[1] // 4, 4, x.shape[2]) + x = x.trans(0, 1, 3, 2) + x = x.reshape(x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]) + return x + + +@triton.jit +def mxfp4_to_bf16_triton(x, scale, mx_axis: tl.constexpr): + """ + Implements the bit-untwiddling of a 32-bit integer (8 mxfp4 elements): + (x << 0) & 0b1000000111000000 + (x << 3) & 0b1000000111000000 + (x << 6) & 0b1000000111000000 + ((x << 1) & 0b1000000000000000) | ((x >> 3) & 0b0000000110000000) | ((x >> 7) & 0b0000000001000000) + """ + # upcast values to bfloat16 + tl.static_assert(len(x.shape) == 2) + tl.static_assert(mx_axis == 0 or mx_axis == 1, "mx_axis must be 0 or 1") + tl.static_assert(x.shape[1] % 4 == 0) + tl.static_assert(x.dtype == tl.uint8) + if mx_axis == 0: + x = x.trans() + x = _unpack_fp4_to_bf16_triton(x) + x = _unshuffle_triton(x, mma_version=3) + if mx_axis == 0: + x = x.trans() + + # upcast scale to bfloat16 + # Add bias missing from the bf16 upcasting sequence + # triton / LLVM generates terrible code for this sequence + # scale = scale.to(tl.uint16) + # scale = scale << 7 + # scale = scale.to(tl.bfloat16, bitcast=True) + scale = tl.inline_asm_elementwise( + r""" + { + prmt.b32 $0, $2, 0, 0x5140; + shl.b32 $0, $0, 7; + prmt.b32 $1, $2, 0, 0x7362; + shl.b32 $1, $1, 7; + } + """, + constraints="=r,=r,r", + args=[scale], + dtype=tl.bfloat16, + is_pure=True, + pack=4, + ) + # Broadcast scale + scale = scale.expand_dims(mx_axis + 1) + scale = scale.broadcast_to(scale.shape[:mx_axis + 1] + [32] + scale.shape[mx_axis + 2:]) + scale = scale.reshape(x.shape) + + # Combine scale and x + x = x * scale + return x diff --git a/triton_kernels/tensor_details/layout_details/strided.py b/triton_kernels/tensor_details/layout_details/strided.py new file mode 100644 index 0000000000..9a8d643504 --- /dev/null +++ b/triton_kernels/tensor_details/layout_details/strided.py @@ -0,0 +1,21 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/tensor_details/layout_details/strided.py +# Triton is licensed under the MIT License. + +from .base import Layout + + +class StridedLayout(Layout): + name: str = None + + def __init__(self, shape) -> None: + super().__init__(shape) + + def swizzle_data(self, data): + return data + + def unswizzle_data(self, data): + return data + + def swizzle_block_shape(self, block_shape): + return block_shape diff --git a/triton_kernels/testing.py b/triton_kernels/testing.py new file mode 100644 index 0000000000..c5ef8f2dce --- /dev/null +++ b/triton_kernels/testing.py @@ -0,0 +1,199 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/testing.py +# Triton is licensed under the MIT License. + +import enum +import functools +import os +import subprocess +import sys +import torch +from triton_kernels.numerics import MAX_FINITE_FLOAT8E4B8, MAX_FINITE_FLOAT8E4NV, MAX_FINITE_FLOAT8E5 + + +def assert_equal(ref, tri): + if isinstance(ref, torch.Tensor): + assert torch.all(ref == tri) + else: + assert ref == tri + + +def assert_close(ref, tri, maxtol=None, rmstol=None, description="--", verbose=True): + if tri.dtype.itemsize == 1: + ref_as_type = ref.to(tri.dtype) + if ref.dtype == tri.dtype: + assert torch.all(ref_as_type == tri) + return + ref = ref_as_type + + if ref.numel() == 0: + return + + if maxtol is None: + maxtol = 2e-2 + if rmstol is None: + rmstol = 4e-3 + """ + Compare reference values against obtained values. + """ + + # cast to float32: + ref = ref.to(torch.float32).detach() + tri = tri.to(torch.float32).detach() + assert ref.shape == tri.shape, f"Tensors must have same size {ref.shape=} {tri.shape=}" + + # deal with infinite elements: + inf_mask_ref = torch.isinf(ref) + inf_mask_tri = torch.isinf(tri) + assert torch.equal(inf_mask_ref, inf_mask_tri), "Tensor must have same infinite elements" + refn = torch.where(inf_mask_ref, 0, ref) + trin = torch.where(inf_mask_tri, 0, tri) + + # normalise so that RMS calculation doesn't overflow: + eps = 1.0e-30 + multiplier = 1.0 / (torch.max(torch.abs(refn)) + eps) + refn *= multiplier + trin *= multiplier + + ref_rms = torch.sqrt(torch.square(refn).mean()) + eps + + rel_err = torch.abs(refn - trin) / torch.maximum(ref_rms, torch.abs(refn)) + max_err = torch.max(rel_err).item() + rms_err = torch.sqrt(torch.square(rel_err).mean()).item() + + if verbose: + print("%s maximum relative error = %s (threshold = %s)" % (description, max_err, maxtol)) + print("%s RMS relative error = %s (threshold = %s)" % (description, rms_err, rmstol)) + + if max_err > maxtol: + bad_idxs = torch.nonzero(rel_err > maxtol) + num_nonzero = bad_idxs.size(0) + bad_idxs = bad_idxs[:1000] + print("%d / %d mismatched elements (shape = %s) at coords %s" % + (num_nonzero, rel_err.numel(), tuple(rel_err.shape), bad_idxs.tolist())) + + bad_idxs = bad_idxs.unbind(-1) + print("ref values: ", ref[tuple(bad_idxs)].cpu()) + print("tri values: ", tri[tuple(bad_idxs)].cpu()) + + assert max_err <= maxtol + assert rms_err <= rmstol + + +class ComputeSanitizerTool(enum.Enum): + MEMCHECK = "memcheck" + RACECHECK = "racecheck" + SYNCCHECK = "synccheck" + INITCHECK = "initcheck" + + +def compute_sanitizer(**target_kwargs): + """ + Decorator to run a test with compute sanitizer enabled and pytorch caching allocator disabled, + to expose potential memory access errors. + This decorator requires the `request` fixture to be present. + If `run_sanitizer` argument is present and set to False, the sanitizer is not run. + Running tests under compute sanitizer requires launching subprocess and is slow, + so use sparingly + """ + + def decorator(test_fn): + + @functools.wraps(test_fn) + def wrapper(*args, **kwargs): + if os.environ.get("SKIP_COMPUTE_SANITIZER") == "1": + test_fn(*args, **kwargs) + return + + import psutil + + if target_kwargs.pop("clear_torch_cache", False): + # If we don't pop clear_torch_cache, it won't pass + # target_kwargs.items() <= kwargs.items() condition below. + torch.cuda.empty_cache() + tools_to_check = target_kwargs.pop("tools_to_check", [ComputeSanitizerTool.MEMCHECK]) + assert isinstance(tools_to_check, list), f"{tools_to_check=}" + assert all(tool in ComputeSanitizerTool for tool in tools_to_check), ( + f"{(tool for tool in tools_to_check if tool not in ComputeSanitizerTool)=}") + + ppid_name = psutil.Process(os.getppid()).exe() + run_compute_sanitizer = target_kwargs.items() <= kwargs.items() + if "run_sanitizer" in kwargs: + run_compute_sanitizer &= kwargs["run_sanitizer"] + if run_compute_sanitizer and "compute-sanitizer" not in ppid_name: + for tool in tools_to_check: + path = os.path.realpath(test_fn.__globals__["__file__"]) + # get path of current file + env = { + "PATH": os.environ["PATH"], + "PYTORCH_NO_CUDA_MEMORY_CACHING": "1", + "TORCH_SHOW_CPP_STACKTRACES": "1", + "CUDA_LAUNCH_BLOCKING": "1", + } + if "CUDA_VISIBLE_DEVICES" in os.environ: + env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"] + assert "request_fixture" in kwargs, ( + "memcheck'ed test must have a (possibly unused) `request` fixture") + test_id = kwargs["request_fixture"].node.callspec.id + cmd = f"{path}::{test_fn.__name__}[{test_id}]" + cmd = [ + "compute-sanitizer", + "--target-processes=application-only", + "--destroy-on-device-error=context", + f"--tool={tool.value}", + sys.executable, + "-m", + "pytest", + "-vsx", + cmd, + ] + for opt in ["--update_checksum", "--ignore_checksum_error"]: + if opt in sys.argv: + cmd.append(opt) + out = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + ) + sanitizer_ok = "ERROR SUMMARY: 0 errors" in str( + out.stdout) or "RACECHECK SUMMARY: 0 hazards displayed" in str(out.stdout) + test_output = out.stdout + if type(test_output) is bytes: + test_output = test_output.decode() + + fail = False + if not sanitizer_ok: + print("compute-sanitizer returned an error") + fail = True + elif out.returncode != 0: + print( + "The test failed due to some other reason: consider running without compute-sanitizer to verify." + ) + print(f"{out.returncode=}") + fail = True + + if fail: + print("*****************************************************") + print("******************** TEST OUTPUT ********************") + print("*****************************************************") + print(test_output) + print("*****************************************************") + print("****************** TEST OUTPUT END ******************") + print("*****************************************************") + assert None + else: + test_fn(*args, **kwargs) + + return wrapper + + return decorator + + +def compute_actual_scale(x, dtype): + max_finite = { + torch.float8_e5m2: MAX_FINITE_FLOAT8E5, + torch.float8_e4m3fn: MAX_FINITE_FLOAT8E4NV, + torch.float8_e4m3fnuz: MAX_FINITE_FLOAT8E4B8, + }[dtype] + return x.abs().max() / max_finite diff --git a/triton_kernels/topk.py b/triton_kernels/topk.py new file mode 100644 index 0000000000..22140cf41a --- /dev/null +++ b/triton_kernels/topk.py @@ -0,0 +1,128 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/topk.py +# Triton is licensed under the MIT License. + +import torch +import triton +from triton_kernels.topk_details._topk_forward import _topk_forward +from triton_kernels.topk_details._topk_backward import _topk_backward +from triton_kernels.tensor import Tensor, Bitmatrix +from typing import Optional, Union + + +def topk_forward(x, k, apply_softmax=True, dim=1, return_bitmatrix=True, y_indx=None, n_rows=None): + if not isinstance(x, Tensor): + x_shape = [x.shape[0] if n_rows is None else n_rows, x.shape[1]] + x_shape_max = [x.shape[0], x.shape[1]] + x = Tensor(x, shape=x_shape, shape_max=x_shape_max) + cdiv = lambda a, b: (a + b - 1) // b + BLOCK_M = 32 + BLOCK_N = 32 + BLOCK_S = 128 + assert len(x.shape) == 2 + assert x.shape_max[-1] < 32768 + assert dim == 1 + assert return_bitmatrix + n_rows, n_cols = x.shape + n_rows_max, _ = x.shape_max + dev = x.device + # scratchpad tensors + # NOTE: these are not returned + y_vals = torch.empty((n_rows_max, k), dtype=x.dtype, device=dev) + if y_indx is not None: + use_provided_indx = True + else: + y_indx = torch.empty((n_rows_max, k), dtype=torch.int16, device=dev) + use_provided_indx = False + # create bitmatrix in transposed memory layout: + n_cols_pad = cdiv(n_cols, BLOCK_N) * BLOCK_N + n_cols_words = n_cols_pad // 32 + bitmatrix = torch.empty((n_cols_words, cdiv(n_rows_max, 32) * 32), dtype=torch.uint32, device=dev) + bitmatrix = torch.transpose(bitmatrix, 0, 1)[:n_rows_max] + s_blocks = cdiv(n_cols, BLOCK_S) + s_cols = s_blocks * BLOCK_S + scratchpad = torch.empty((s_cols, ), dtype=torch.int32, device=dev) + pids = max(cdiv(n_rows_max, BLOCK_M), s_blocks) + _topk_forward[(pids, )]( + x, x.stride(0), # inputs + y_vals, y_indx, y_vals.stride(0), use_provided_indx, # output [topk] + bitmatrix, bitmatrix.stride(0), bitmatrix.stride(1), # output [bitmatrix] + n_rows, n_cols, # shapes + scratchpad, BLOCK_S, s_blocks, # thing to memset to zero + BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, # tunable parameter + APPLY_SOFTMAX=apply_softmax, N_EXPTS_PAD=n_cols_pad, N_EXPTS_ACT=k, # constants + ) + bitmatrix_shape = [n_rows, n_cols_words * 32] + bitmatrix_shape_max = [n_rows_max, None] + bitmatrix = Bitmatrix(bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=scratchpad) + return y_vals, y_indx, bitmatrix + + +def topk_backward(x, y_indx, dy_vals, k, n_rows, apply_softmax): + assert dy_vals.shape[-1] == k + n_expts_pad = triton.next_power_of_2(x.shape[-1]) + dx = torch.empty_like(x) + _topk_backward[(dy_vals.shape[0], )]( + y_indx, y_indx.stride(0), dy_vals, dy_vals.stride(0), x, x.stride(0), # inputs + dx, # outputs + dx.stride(0), x.shape[0], n_rows, x.shape[-1], APPLY_SOFTMAX=apply_softmax, N_EXPTS_ACT=k, + N_EXPTS_PAD=n_expts_pad) + return dx + + +class TopK(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, k, apply_softmax, dim, return_bitmatrix, y_indx, n_rows): + y_vals, y_indx, bitmatrix = topk_forward(x, k, apply_softmax, dim, return_bitmatrix, y_indx, n_rows) + ctx.save_for_backward(x, y_indx) + ctx.apply_softmax = apply_softmax + ctx.k = k + ctx.n_rows = n_rows + return y_vals, y_indx, bitmatrix + + @staticmethod + def backward(ctx, dy_vals, _0, _1): + x, y_indx = ctx.saved_tensors + dx = topk_backward(x, y_indx, dy_vals, ctx.k, ctx.n_rows, ctx.apply_softmax) + return dx, None, None, None, None, None, None + + +def topk( + x: Union[Tensor, torch.Tensor], + k: int, + apply_softmax: bool = True, + dim: int = 1, + return_bitmatrix: bool = True, + y_indx: Optional[torch.Tensor] = None, + n_rows: Optional[int] = None, +): + """ + Computes the top-k values and indices along a specified dimension of a tensor. + Note that the input can be either a `Tensor` or a `torch.Tensor`, but the output will always be a `torch.Tensor`. + + Parameters + ---------- + x : Union[triton_kernels.Tensor, torch.Tensor] + Input tensor of shape (n_tokens, n_expts). + k : int + Number of top elements to retrieve. + apply_softmax : bool, default True + Whether to apply softmax to the input tensor before computing top-k. + dim : int, default 1 + Dimension along which to compute top-k. + return_bitmatrix : bool, default True + A bitmatrix of shape (n_tokens, cdiv(n_expts, 32)). + Each bit on [t, b] indicates whether the b-th expert was selected for the t-th token. + y_indx : torch.Tensor, optional + Pre-allocated tensor for storing indices of top-k elements with shape (n_tokens, k). + If provided, we skip the computation of top-k indices and use this tensor instead. + n_rows : int, optional + Number of rows to apply top-k on. If None, we consider all rows in `x`. + + Returns + ------- + (expt_scal, expt_indx, bitmatrix) : Tuple[torch.Tensor, torch.Tensor, Bitmatrix] + """ + ret = TopK.apply(x, k, apply_softmax, dim, return_bitmatrix, y_indx, n_rows) + return ret diff --git a/triton_kernels/topk_details/__init__.py b/triton_kernels/topk_details/__init__.py new file mode 100644 index 0000000000..6a41f2b81a --- /dev/null +++ b/triton_kernels/topk_details/__init__.py @@ -0,0 +1,3 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/topk_details/__init__.py +# Triton is licensed under the MIT License. diff --git a/triton_kernels/topk_details/_topk_backward.py b/triton_kernels/topk_details/_topk_backward.py new file mode 100644 index 0000000000..f03166fb9b --- /dev/null +++ b/triton_kernels/topk_details/_topk_backward.py @@ -0,0 +1,55 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/topk_details/_topk_backward.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + + +@triton.jit +def _topk_backward( + Yi, + stride_ym, # topk indices + DY, + stride_dym, # output gradient values + X, + stride_xm, # input values + DX, + stride_dxm, # input gradient values + n_rows, + NRows, + n_expts_tot, + APPLY_SOFTMAX: tl.constexpr, + N_EXPTS_ACT: tl.constexpr, + N_EXPTS_PAD: tl.constexpr, +): + pid_m = tl.program_id(0) + if NRows is not None: + n_rows = tl.load(NRows) + if pid_m >= n_rows: + return + Yi += pid_m * stride_ym + DY += pid_m * stride_dym + X += pid_m * stride_xm + DX += pid_m * stride_dxm + # -- + offs_xn = tl.arange(0, N_EXPTS_PAD) + offs_yn = tl.arange(0, N_EXPTS_ACT) + mask_xn = offs_xn < n_expts_tot + # recompute softmax + y_indx = tl.load(Yi + offs_yn) + x = tl.load(X + y_indx) + x = x.to(tl.float32) + y = tl.softmax(x) + # compute input-gradient + dy = tl.load(DY + offs_yn) + dy = dy.to(tl.float32) + s = tl.sum(y * dy, 0) + # write-back input gradient + tl.store(DX + offs_xn, 0, mask=mask_xn) + tl.debug_barrier() + if APPLY_SOFTMAX: + dx = y * (dy - s) + else: + dx = dy + tl.store(DX + y_indx, dx) diff --git a/triton_kernels/topk_details/_topk_forward.py b/triton_kernels/topk_details/_topk_forward.py new file mode 100644 index 0000000000..4ccdbd2083 --- /dev/null +++ b/triton_kernels/topk_details/_topk_forward.py @@ -0,0 +1,150 @@ +# This file is vendored from the Triton project. DO NOT EDIT THIS FILE DIRECTLY. +# Source: https://github.com/triton-lang/triton/tree/v3.5.1/python/triton_kernels/triton_kernels/topk_details/_topk_forward.py +# Triton is licensed under the MIT License. + +import triton +import triton.language as tl + + +@triton.jit +def get_topmask_and_fullmask(x): + tl.static_assert(x.dtype.is_int_unsigned(), "floating-point value must be passed as bits") + tm: tl.constexpr = 1 << (-1 + x.dtype.primitive_bitwidth) + fm: tl.constexpr = (1 << x.dtype.primitive_bitwidth) - 1 + tm_arr = tl.full(x.shape, tm, dtype=x.dtype) + fm_arr = tl.full(x.shape, fm, dtype=x.dtype) + return tm_arr, fm_arr + + +@triton.jit +def fpval_to_key(x): + tm, fm = get_topmask_and_fullmask(x) + return x ^ tl.where((x & tm) != 0, fm, tm) + + +@triton.jit +def key_to_fpval(x): + tm, fm = get_topmask_and_fullmask(x) + return x ^ tl.where((x & tm) == 0, fm, tm) + + +# stable top-k tie-breaks to value with smaller index +@triton.jit +def indx_to_key(indx, N_EXPTS_PAD: tl.constexpr): + return N_EXPTS_PAD - indx + + +@triton.jit +def key_to_indx(indx, N_EXPTS_PAD: tl.constexpr): + return N_EXPTS_PAD - indx + + +@triton.jit +def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, + BLOCK_N: tl.constexpr): + x_nbits: tl.constexpr = X.dtype.element_ty.primitive_bitwidth + x_utype: tl.constexpr = tl.dtype(f"uint{x_nbits}") + if x_nbits < 16: + # this ensures that we leave at least 16 bits for expert index + # even if the input dtype is smaller than 16 bits: + y_nbits: tl.constexpr = 32 + else: + y_nbits: tl.constexpr = x_nbits * 2 + x_ultype: tl.constexpr = tl.dtype(f"uint{y_nbits}") + x_dtype: tl.constexpr = X.dtype.element_ty + + # subtract 1 from loop iterations because we peel the first (masked) iteration: + loop_iterations: tl.constexpr = N_EXPTS_PAD // BLOCK_N - 1 + offs_x_n = loop_iterations * BLOCK_N + tl.arange(0, BLOCK_N) + mask_n = offs_x_n[None, :] < n_expts_tot + + # first iteration: + X_ptrs = X + offs_m[:, None] * stride_xm + offs_x_n[None, :] + x = tl.load(X_ptrs, mask=(mask_m & mask_n), other=float("-inf")) + x = fpval_to_key(x.to(x_utype, bitcast=True)) + x = (x.to(x_ultype) << 16) | indx_to_key(offs_x_n, N_EXPTS_PAD)[None, :] + acc = tl.topk(x, N_EXPTS_ACT, dim=1) + + # subsequent iterations: + for _i in (tl.static_range if loop_iterations <= 4 else range)(loop_iterations): + acc = tl.bitonic_merge(acc) # ensure sorted ascending for the merge + X_ptrs -= BLOCK_N + offs_x_n -= BLOCK_N + x = tl.load(X_ptrs, mask=mask_m, other=float("-inf")) + x = fpval_to_key(x.to(x_utype, bitcast=True)) + x = (x.to(x_ultype) << 16) | indx_to_key(offs_x_n, N_EXPTS_PAD)[None, :] + acc = tl.maximum(acc, tl.topk(x, N_EXPTS_ACT, dim=1)) + + # rotate expert index into upper 16 bits: + # 0000vvvvvvvviiii --> iiii0000vvvvvvvv + acc = (acc << (y_nbits - 16)) | (acc >> 16) + # sort in ascending order of expert (descending order of key) + acc = tl.sort(acc, dim=1, descending=True) + # iiii0000vvvvvvvv --> 0000iiii: + y_indices_raw = (acc >> (y_nbits - 16)).to(tl.uint32) + y_indices = key_to_indx(y_indices_raw, N_EXPTS_PAD) + # iiii0000vvvvvvvv --> vvvvvvvv: + y_values_raw = acc.to(x_utype) + y_values = key_to_fpval(y_values_raw).to(x_dtype, bitcast=True) + + return y_values, y_indices + + +@triton.jit +def _topk_forward(X, stride_xm, # inputs + Yv, Yi, stride_ym, # topk values/indices + USE_PROVIDED_INDX: tl.constexpr, Bits, stride_rm: tl.constexpr, stride_rn: tl.constexpr, # bitmatrix + n_rows, n_expts_tot, # shape + S, BLOCK_S: tl.constexpr, s_blocks, # thing to memset + APPLY_SOFTMAX: tl.constexpr, # constant + BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr): + + pid = tl.program_id(0) + if isinstance(n_rows, tl.tensor) and n_rows.dtype.is_ptr(): + n_rows = tl.load(n_rows) + + if pid < s_blocks: + tl.store(S + BLOCK_S * pid + tl.arange(0, BLOCK_S), tl.zeros([BLOCK_S], tl.int32)) + + if pid * BLOCK_M >= n_rows: + # early exit: + return + + tl.static_assert(BLOCK_N % 32 == 0) + tl.static_assert(N_EXPTS_PAD % BLOCK_N == 0) + x_dtype: tl.constexpr = X.dtype.element_ty + + # load logits + offs_m = pid * BLOCK_M + tl.arange(0, BLOCK_M) + offs_y_n = tl.arange(0, N_EXPTS_ACT) + mask_m = offs_m[:, None] < n_rows + if USE_PROVIDED_INDX: + Yi_ptrs = Yi + offs_m[:, None] * stride_ym + offs_y_n[None, :] + y_indices = tl.load(Yi_ptrs, mask=mask_m) + Xv_ptrs = X + offs_m[:, None] * stride_xm + y_indices + y_values = tl.load(Xv_ptrs, mask=mask_m) + else: + y_values, y_indices = streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, # + N_EXPTS_PAD, N_EXPTS_ACT, BLOCK_N) + + # normalize selected values + if APPLY_SOFTMAX: + y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype) + + # write back + Yv_ptrs = Yv + offs_m[:, None] * stride_ym + offs_y_n[None, :] + tl.store(Yv_ptrs, y_values, mask=mask_m) + if not USE_PROVIDED_INDX: + Yi_ptrs = Yi + offs_m[:, None] * stride_ym + offs_y_n[None, :] + tl.store(Yi_ptrs, y_indices, mask=mask_m) + + # pack into bitmatrix + y_div = y_indices // 32 + y_rem = y_indices % 32 + loop_iterations = N_EXPTS_PAD // BLOCK_N + for i in range(loop_iterations): + offs_r_n = tl.arange(0, BLOCK_N // 32) + i * (BLOCK_N // 32) + y2 = tl.where(y_div[:, :, None] == offs_r_n[None, None, :], (1 << y_rem)[:, :, None], 0) + r = tl.reduce_or(y2, axis=1) + BitsPtrs = Bits + offs_m[:, None] * stride_rm + offs_r_n[None, :] * stride_rn + tl.store(BitsPtrs, r, mask=mask_m)