TensorRT-LLMs/scripts/gen_cuda_headers_for_xqa.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Embed system CUDA headers in C++ arrays.

import argparse
import os
from collections import namedtuple
from pathlib import Path

def _positive_int(value: str) -> int:
    """argparse ``type=`` callback that accepts only integers greater than zero.

    A chunk size of zero or less would make the chunking loop in
    convert_to_cpp_raw_str never advance, so it is rejected while parsing.
    """
    number = int(value)
    if number <= 0:
        raise argparse.ArgumentTypeError(
            'expected a positive integer, got {!r}'.format(value))
    return number


# Command-line interface of the generator. ArgumentDefaultsHelpFormatter makes
# --help show the default of every option.
parser = argparse.ArgumentParser(
    description='Embed system CUDA headers in cpp arrays',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('--output_file', help='Output c++ file name', required=True)
parser.add_argument(
    '--input_files',
    help='Input CUDA header file name list, separated by ","',
    default=
    'cuda_bf16.h,cuda_bf16.hpp,cuda_fp16.h,cuda_fp16.hpp,cuda_fp8.h,cuda_fp8.hpp,vector_types.h,vector_functions.h'
)
parser.add_argument('--cuda_root',
                    help='CUDA Toolkit path',
                    default='/usr/local/cuda')
# '--chunk-size' is the original spelling; '--chunk_size' is accepted as an
# alias so that it matches the underscore style of the other options. Both
# store into args.chunk_size.
parser.add_argument(
    '--chunk-size',
    '--chunk_size',
    type=_positive_int,
    help=
    'Max length for each literal string in the output. Strings would be split into multiple smaller substrings if the length exceeds chunk-size.',
    default=80)

# Parsed at module level: the rest of the script reads the options from `args`.
args = parser.parse_args()

# Written once at the top of the generated C++ file: license banner,
# `#pragma once`, and the opening of the tensorrt_llm / kernels namespaces.
TEMPLATE_PROLOGUE = '''/*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#pragma once
namespace tensorrt_llm {
namespace kernels {
'''

# Filled in once per embedded header via str.format(). It declares two
# constants: one holding the header text and one holding the header file name.
# Prepend the magic string to disable NVRTC encryption.
# {content} is substituted directly after the magic string literal with no
# separator, so it must itself be one or more quoted C++ string literals;
# adjacent string literals are concatenated by the C++ compiler.
TEMPLATE_CONTENT = '''constexpr const char* {content_var_name} = "j3iAA#$)7"{content};
constexpr const char* {fname_var_name} = "{fname}";
'''

# Written once at the end of the generated file: closes the two namespaces
# opened by TEMPLATE_PROLOGUE.
TEMPLATE_EPILOGUE = '''}
}
'''


# Input: "ThisIsAString.h" / "this_is_a_string.h"
# Output: "this_is_a_string_h"
def get_canonized_str(s: str):
    """Turn a file name into a lower-case, underscore-separated identifier.

    The string is cut into words. A word begins at any alphabetic or digit
    character and then absorbs the run of lower-case letters and digits that
    directly follows it, so an upper-case letter always opens a new word.
    Characters that are neither alphabetic nor digits only separate words and
    are dropped. The words are lower-cased and joined with '_'.

    Args:
        s: The text to canonize, e.g. a header file name.

    Returns:
        The canonized string; empty when ``s`` contains no letters or digits.
    """
    words = []
    pending = []  # Characters of the word currently being collected.

    def flush():
        # Close the word in progress, if there is one.
        if pending:
            words.append(''.join(pending).lower())
            pending.clear()

    for ch in s:
        # Inside a word, lower-case letters and digits extend it.
        if pending and (ch.islower() or ch.isdigit()):
            pending.append(ch)
            continue
        # Anything else ends the current word ...
        flush()
        # ... and a letter or digit immediately starts the next one.
        if ch.isalpha() or ch.isdigit():
            pending.append(ch)
    flush()
    return '_'.join(words)


# Returned string includes the surrounding double quotation marks.
def convert_to_cpp_raw_str(s: str, chunk_size=None):
    """Encode ``s`` as one or more octal-escaped C++ string literals.

    Despite the name, the result is not a C++ raw string literal: every byte
    of the UTF-8 encoding of ``s`` is written as a three-digit octal escape
    (``\\ooo``), so no character of the input needs any further quoting.

    When ``s`` is longer than ``chunk_size`` it is cut into pieces of at most
    ``chunk_size`` characters (characters of ``s``, not encoded bytes). Each
    piece becomes its own quoted literal and the literals are separated by
    newlines.

    Args:
        s: Text to embed.
        chunk_size: Maximum number of input characters per literal. When
            omitted, the ``--chunk-size`` command-line value
            (``args.chunk_size``) is used.

    Returns:
        The literal(s), including the surrounding double quotation marks.

    Raises:
        ValueError: If the chunk size is not positive. The original loop
            never advanced for such values and did not terminate.
    """
    if chunk_size is None:
        chunk_size = args.chunk_size
    if chunk_size <= 0:
        raise ValueError(
            'chunk_size must be a positive integer, got {!r}'.format(
                chunk_size))

    def to_literal(piece: str) -> str:
        # Iterating over bytes yields ints; "03o" zero-pads to three octal
        # digits, the widest form an octal escape can take.
        escaped = ''.join('\\' + format(byte, '03o')
                          for byte in piece.encode('utf-8'))
        return '"' + escaped + '"'

    if len(s) <= chunk_size:
        return to_literal(s)
    return '\n'.join(
        to_literal(s[start:start + chunk_size])
        for start in range(0, len(s), chunk_size))


# Names of the two C++ constants generated for one embedded header: the one
# holding the header text and the one holding the header file name.
Entry = namedtuple('Entry', ['content_var_name', 'fname_var_name'])
entries = []

# Collect the pieces of the generated file and join them once at the end
# instead of growing a string with repeated `+=`.
output_parts = [TEMPLATE_PROLOGUE]
for input_file in args.input_files.split(','):
    # Tolerate spaces around the commas and empty items (e.g. a trailing
    # comma) in --input_files.
    input_file = input_file.strip()
    if not input_file:
        continue
    canonized_name = get_canonized_str(input_file)
    fname_var_name = canonized_name + '_fname'
    content_var_name = canonized_name + '_content'
    input_full_path = os.path.join(args.cuda_root, 'include', input_file)
    # Decode explicitly as UTF-8 so the result does not depend on the locale;
    # convert_to_cpp_raw_str re-encodes the text as UTF-8 bytes.
    with open(input_full_path, 'r', encoding='utf-8') as f:
        input_content = f.read()
    output_parts.append(
        TEMPLATE_CONTENT.format(
            content_var_name=content_var_name,
            content=convert_to_cpp_raw_str(input_content),
            fname_var_name=fname_var_name,
            fname=input_file))
    entries.append(
        Entry(content_var_name=content_var_name, fname_var_name=fname_var_name))

if not entries:
    # Without any header the two lookup arrays below would be empty, which is
    # not a valid C++ array definition.
    raise ValueError('--input_files must name at least one header file')

# Two parallel arrays: index i of cuda_headers_content holds the text of the
# header whose file name is at index i of cuda_headers_name.
output_parts.append("constexpr char const* cuda_headers_content[] = {\n")
output_parts.extend("    " + entry.content_var_name + ",\n"
                    for entry in entries)
output_parts.append("};\n")

output_parts.append("constexpr char const* cuda_headers_name[] = {\n")
output_parts.extend("    " + entry.fname_var_name + ",\n" for entry in entries)
output_parts.append("};\n")

output_parts.append(TEMPLATE_EPILOGUE)
output_content = ''.join(output_parts)

# Create the destination directory if needed. For a bare file name dirname()
# is '', which Path treats as the current directory.
output_dir = os.path.dirname(args.output_file)
Path(output_dir).mkdir(parents=True, exist_ok=True)

with open(args.output_file, 'w', encoding='utf-8') as f:
    f.write(output_content)