[None][fix] Introduce inline namespace to avoid symbol collision (#9541)

Signed-off-by: Yihan Wang <yihwang@nvidia.com>
Authored by Yihan Wang on 2025-12-12 23:32:15 +08:00; committed by GitHub
parent af315d8ef1
commit 9df4dad3b6
621 changed files with 4168 additions and 9576 deletions

.gitattributes vendored (+2)
View File

@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text

.gitignore vendored (+1)
View File

@ -74,6 +74,7 @@ llm-test-workspace/
cpp/include/tensorrt_llm/executor/version.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
.devcontainer/.env
/examples/layer_wise_benchmarks/profiles/

View File

@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -17,13 +18,16 @@
*/
#include "utils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <random>
#include <filesystem>
#include <fstream>
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@ -98,7 +102,8 @@ Samples parseWorkloadJson(
if (samples.size() < maxNumSamples)
{
TLLM_LOG_WARNING(
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
"Dataset size %zu is smaller than given max_num_samples "
"%d, max_num_samples will be ignored.\n",
samples.size(), maxNumSamples);
}
return samples;
@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
return os;
}
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/executor.h"
#include <cstdint>
@ -29,7 +30,9 @@
#pragma once
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
// using namespace tensorrt_llm::batch_manager;
@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -16,8 +16,9 @@
#pragma once
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
// Base class for algorithms
struct Algorithm
@ -29,4 +30,4 @@ struct Algorithm
Algorithm& operator=(Algorithm const&) = delete;
};
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,9 +17,13 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
//!
@ -100,4 +104,6 @@ private:
size_type mSize;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,14 +16,19 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
TRTLLM_NAMESPACE_BEGIN
class DebugConfig
{
public:
static bool isCheckDebugEnabled();
};
TRTLLM_NAMESPACE_END
#if defined(_WIN32)
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@ -35,8 +40,8 @@ public:
#define TLLM_CHECK(val) \
do \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} while (0)
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
@ -51,17 +56,17 @@ public:
#define TLLM_CHECK_DEBUG(val) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} \
} while (0)
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) \
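
The reason the rewritten macro bodies spell tensorrt_llm::DebugConfig in full: macros expand textually at the call site, so an unqualified DebugConfig would be looked up wherever the check is written, not where the macro was defined. A small sketch under an assumed call-site namespace (other_component and its local type are hypothetical):

#include "tensorrt_llm/common/assert.h"

namespace other_component // hypothetical call-site namespace
{
struct DebugConfig // unrelated local type with the same name
{
};

void validate(int n)
{
    // Expands to tensorrt_llm::DebugConfig::isCheckDebugEnabled(), so the
    // local other_component::DebugConfig above cannot hijack the lookup.
    TLLM_CHECK_DEBUG(n >= 0);
}
} // namespace other_component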

View File

@ -17,9 +17,13 @@
#pragma once
#include "c10/util/intrusive_ptr.h"
#include "tensorrt_llm/common/config.h"
#include <Python.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Adapted from pybind11's example implementation:
@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
return c10::intrusive_ptr<T>::reclaim_copy(p);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef TRTLLM_CONFIG_H
#define TRTLLM_CONFIG_H
/**
* \def TRTLLM_ABI_NAMESPACE
* This macro names the inline namespace used for ABI versioning.
* This macro can be overridden to change the ABI version.
* The default ABI version is _v1.
*/
#ifndef TRTLLM_ABI_NAMESPACE
#define TRTLLM_ABI_NAMESPACE _v1
#endif
#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
#define TRTLLM_ABI_NAMESPACE_BEGIN \
inline namespace TRTLLM_ABI_NAMESPACE \
{
#endif
#ifndef TRTLLM_ABI_NAMESPACE_END
#define TRTLLM_ABI_NAMESPACE_END }
#endif
/**
* \def TRTLLM_NAMESPACE_BEGIN
* This macro is used to open a `tensorrt_llm::` namespace block, along with
* the ABI inline namespace opened by TRTLLM_ABI_NAMESPACE_BEGIN.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_BEGIN \
namespace tensorrt_llm \
{ \
TRTLLM_ABI_NAMESPACE_BEGIN
/**
* \def TRTLLM_NAMESPACE_END
* This macro is used to close a `tensorrt_llm::` namespace block, along with
* the ABI inline namespace closed by TRTLLM_ABI_NAMESPACE_END.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_END \
TRTLLM_ABI_NAMESPACE_END \
} /* end namespace tensorrt_llm */
#endif // TRTLLM_CONFIG_H
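
The mechanism this header introduces: everything declared between the macros is mangled inside the versioned inline namespace while staying reachable under its unversioned name. A minimal sketch of the expansion with the default _v1 ABI namespace (the touch function is a hypothetical illustration, not part of the diff):

#include "tensorrt_llm/common/config.h"

TRTLLM_NAMESPACE_BEGIN // expands to: namespace tensorrt_llm { inline namespace _v1 {
namespace common
{
void touch(); // the linker-level symbol lives in tensorrt_llm::_v1::common
} // namespace common
TRTLLM_NAMESPACE_END // expands to: } }

void caller()
{
    // Inline namespaces are transparent to qualified lookup, so existing
    // call sites keep compiling unchanged...
    tensorrt_llm::common::touch();
    // ...while the explicit ABI-qualified spelling names the same function:
    tensorrt_llm::_v1::common::touch();
}

Two builds compiled with different TRTLLM_ABI_NAMESPACE values therefore export distinct mangled symbols and can coexist in one process, which is the symbol-collision fix this commit is after.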

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
@ -29,8 +31,8 @@
#define USE_QGMMA
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
#endif // ENABLE_FP8

View File

@ -14,12 +14,18 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_set>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Populate the start and end profiling iteration indexes from the provided environment variables
@ -28,4 +34,6 @@ namespace tensorrt_llm::common
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
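
A hypothetical call of this helper, for orientation (the environment-variable name below is a made-up placeholder, not taken from the diff):

#include "tensorrt_llm/common/cudaProfilerUtils.h"

void setupProfiling()
{
    // Populate the start/stop profiling iteration sets from the named env var.
    auto const [startIters, stopIters] = tensorrt_llm::common::populateIterationIndexes(
        "HYPOTHETICAL_PROFILE_ITERS_ENV_VAR", /*legacyEnvVarName=*/std::nullopt);
    (void) startIters;
    (void) stopIters;
}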

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
@ -49,7 +50,9 @@
// this undef.
#endif // WIN32
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// workspace for cublas gemm : 32MB
@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
DEFINE_MEMBER_CHECKER(qua)
DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
#include <NvInferRuntime.h>
#include <map>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr static size_t getDTypeSize(nvinfer1::DataType type)
@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
return "";
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -22,9 +22,12 @@
#include <string>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class Logger
@ -125,12 +128,12 @@ private:
static inline std::string getPrefix(Level const level)
{
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
}
static inline std::string getPrefix(Level const level, int const rank)
{
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
}
};
@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
out << std::endl;
}
}
} // namespace common
TRTLLM_NAMESPACE_END
#define TLLM_LOG(level, ...) \
do \
@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
} // namespace tensorrt_llm::common
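
With the namespace now closed before the macro definitions, the TLLM_LOG_* macros sit at file scope but remain safe to invoke from anywhere, because their bodies fully qualify tensorrt_llm::common::Logger. A minimal usage sketch (my_app::warmup is hypothetical):

#include "tensorrt_llm/common/logger.h"

namespace my_app // hypothetical application namespace
{
void warmup()
{
    // No tensorrt_llm namespace needs to be open at the call site.
    TLLM_LOG_WARNING("engine warmup took %d ms", 42);
}
} // namespace my_app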

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <memory>
#include <optional>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/**
@ -100,4 +104,6 @@ public:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -480,4 +482,5 @@ public:
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif // ENABLE_BF16
@ -28,7 +29,9 @@
#include <unordered_set>
#include <vector>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#if ENABLE_BF16
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <array>
@ -41,7 +42,9 @@
tensorrt_llm::common::RequestSpecificException( \
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Enumeration of different error codes for request-specific exceptions
@ -77,7 +80,8 @@ private:
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
{
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
throw TllmException(
file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
}
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
@ -102,4 +106,6 @@ private:
RequestErrorCode mErrorCode;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <algorithm>
#include <initializer_list>
#include <string>
@ -24,7 +26,9 @@
#include <pthread.h>
#endif
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
inline bool setThreadName(std::string const& name)
@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
return std::find(c.begin(), c.end(), v) != c.end();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,7 +16,11 @@
#pragma once
namespace tensorrt_llm::kernels
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
namespace detail
@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;
} // namespace arch
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/types.h"
#include <cstdint>
#include <curand_kernel.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class FinishedState
@ -308,4 +311,6 @@ template <typename T>
void invokeScatterDecodingParams(
T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <cuda_runtime.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class KVCacheIndex
@ -53,4 +56,6 @@ private:
UnderlyingType value;
};
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,16 +14,18 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/iBuffer.h"
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname):
params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname):
const int COMPUTE_REG_COUNT = {compute_reg_count};
asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
compute_reg_count=compute_reg_count)
local_ns_open = ns_open if generate_cu_trtllm else ''
local_ns_close = ns_close if generate_cu_trtllm else ''
abi_ns_open = r"""
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
"""
abi_ns_close = r"""
// clang-format on
} // namespace kernels
TRTLLM_NAMESPACE_END
"""
local_ns_open = abi_ns_open if generate_cu_trtllm else ''
local_ns_close = abi_ns_close if generate_cu_trtllm else ''
tmp = dict(locals(), **kspec._asdict())
@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):
def get_cubin_header(kernel_traits, specs_names):
cubins = []
cubin_lens = []
launchers = []
cubins_dict = {}
cubin_lens_dict = {}
launchers_dict = {}
for kspec, fname, lname, kname in specs_names:
if generate_cu_trtllm and not use_cubin_header(
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names):
if generate_cu_trtllm and lname != 'nullptr':
launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
lname=lname)
if int(sm) in cubins_dict:
if launcher not in cubins_dict[int(sm)]:
cubins_dict[int(sm)].append(launcher)
if int(sm) in launchers_dict:
if launcher not in launchers_dict[int(sm)]:
launchers_dict[int(sm)].append(launcher)
else:
cubins_dict[int(sm)] = [launcher]
launchers_dict[int(sm)] = [launcher]
elif 'mhca' in kname:
code = '''\
{{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
@ -3309,17 +3323,33 @@ def get_cubin_header(kernel_traits, specs_names):
else:
metadata_v2 = ',\n'.join(metadata_v2)
# Add macros to only include needed cubins during compilation.
for sm in cubins_dict.keys():
# Collect all SM versions from all dictionaries
all_sms = sorted(
set(
list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
list(launchers_dict.keys())))
for sm in all_sms:
macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
macro_end = f"#endif\n"
# Add cubin array declarations
if sm in cubins_dict:
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
# Add cubin length declarations
if sm in cubin_lens_dict:
cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
# Add launcher declarations
if sm in launchers_dict:
launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])
unroll_config_v1 = ',\n'.join(unroll_config_v1)
unroll_config_v2 = ',\n'.join(unroll_config_v2)
cubins = '\n'.join(cubins)
cubin_lens = '\n'.join(cubin_lens)
launchers = '\n'.join(launchers)
local_ns_open = ns_open
local_ns_close = ns_close if generate_cu_trtllm else '}'
launcher_line = '''
@ -3431,7 +3461,157 @@ static const struct TestMetaV2
'''.format(**locals(), copyright=copyright)
return code
# Generate header content (.h file)
if "GENERATE_CUBIN" in os.environ:
header_content = '''\
{copyright}
#pragma once
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels{{
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
extern const int sMhaKernelMetaInfosV2Size;
}} // namespace kernels
TRTLLM_NAMESPACE_END
'''.format(**locals(), copyright=copyright)
# Generate source content (.cpp file)
source_content = '''\
{copyright}
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
#include <cuda_runtime_api.h>
{local_ns_open}
//--- Cubin Arrays
{cubins}
//--- Cubin Lengths
{cubin_lens}
{local_ns_close}
using namespace tensorrt_llm::kernels;
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
class Fused_multihead_attention_params_v2;
class Launch_params;
//--- Kernel Launchers
{launchers}
// FIXME: These are duplicated declarations; we should remove them in the future.
constexpr int32_t kSM_70 = 70;
constexpr int32_t kSM_72 = 72;
constexpr int32_t kSM_75 = 75;
constexpr int32_t kSM_80 = 80;
constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_89 = 89;
constexpr int32_t kSM_90 = 90;
constexpr int32_t kSM_100 = 100;
constexpr int32_t kSM_100f = 10100;
constexpr int32_t kSM_103 = 103;
constexpr int32_t kSM_120 = 120;
constexpr int32_t kSM_121 = 121;
// FIXME: These are duplicated declarations; we should remove them in the future.
enum Data_type
{{
DATA_TYPE_BOOL,
DATA_TYPE_FP16,
DATA_TYPE_FP32,
DATA_TYPE_INT4,
DATA_TYPE_INT8,
DATA_TYPE_INT32,
DATA_TYPE_BF16,
DATA_TYPE_E2M1,
DATA_TYPE_E4M3,
DATA_TYPE_E5M2
}};
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
{metadata_v2}
}};
extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
'''.format(**locals(), copyright=copyright)
else:
# Non-GENERATE_CUBIN mode: use old behavior
header_content = code
source_content = None
return header_content, source_content
# This is used to add some kernels running in cubins for passing CI cases.
@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header):
return result
target = "#ifndef EXCLUDE_SM_80"
addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
result = add_kernel_line(result, target, addition)
addition_cubin_array = """
#ifndef EXCLUDE_SM_80
extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
#endif
"""
addition_cubin_length = """
#ifndef EXCLUDE_SM_80
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
#endif
"""
# Add the cubin array and length into their corresponding sections.
result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
result = add_kernel_line(result, "//--- Cubin Lengths",
addition_cubin_length)
def modify_kernel_line(result, target, new_line):
lines = result.split('\n')
@ -3534,13 +3725,22 @@ def generate_files(specs_names):
output = output.decode('utf-8').strip()
# this gives: kname, smem bytes, threads_per_cta, loop_step
kernel_traits = [traits.split() for traits in output.splitlines()]
cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
# get_cubin_header now generates both the fmha_cubin.h and fmha_cubin.cpp contents;
# in non-GENERATE_CUBIN mode it falls back to the old header-only behavior.
cubin_header, cubin_source = get_cubin_header(kernel_traits,
valid_specs_names)
if generate_cu_trtllm:
cubin_header = modify_cubin_header(cubin_header)
cubin_source = modify_cubin_header(cubin_source)
# Write fmha_cubin.h file
with open('./generated/fmha_cubin.h', 'w') as f:
f.write(cubin_header)
# Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
if cubin_source is not None:
with open('./generated/fmha_cubin.cpp', 'w') as f:
f.write(cubin_source)
def enumerate_hgmma_tma_kernels(specs, sm=90):
specs.append(

View File

@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
*/
#pragma once
namespace tensorrt_llm {
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels {
'''
@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
'''
TEMPLATE_EPILOGUE = '''}
}
TRTLLM_NAMESPACE_END
'''
D = defaultdict(list)

View File

@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
* See the License for the specific language governing permissions and
* limitations under the License.
*/
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
@ -96,7 +98,7 @@ namespace kernels
cpp_file_suffex_text = R"""
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
"""
cubin_meta_info_struct_prefix_text = R"""

View File

@ -27,7 +27,7 @@ bool initCheckDebug()
}
} // namespace
bool DebugConfig::isCheckDebugEnabled()
bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
{
static bool const debugEnabled = initCheckDebug();
return debugEnabled;

View File

@ -16,6 +16,7 @@
*/
#include "attentionOp.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/common/quantization.h"
@ -36,7 +37,9 @@
#include <nccl.h>
#endif // ENABLE_MULTI_DEVICE
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
class AttentionOp
@ -543,4 +546,6 @@ private:
UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
};
} // namespace tensorrt_llm::common::op
} // namespace common::op
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasVersionCheck.h"
#include <algorithm>
#include <unordered_map>
@ -24,8 +25,8 @@
#error CUDART_VERSION Undefined!
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
@ -24,8 +25,8 @@
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -185,4 +186,4 @@ public:
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif // ENABLE_BF16
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
// Operator definitions intentionally in global namespace
namespace

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <assert.h>
#include <cstdlib>
@ -28,8 +29,8 @@
#include <string>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
static __host__ __device__ int hash(int val)
@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,6 +18,7 @@
#if defined(_WIN32)
#include <windows.h>
#define dllOpen(name) LoadLibrary("nv" name ".dll")
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
@ -29,6 +30,7 @@
#endif // defined(_WIN32)
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/logger.h"
#include <cuda.h>
@ -36,7 +38,9 @@
#include <cstdio>
#include <mutex>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#ifndef CUDA_DRIVER_WRAPPER_H
#define CUDA_DRIVER_WRAPPER_H
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
@ -25,7 +26,9 @@
#include <cstdio>
#include <memory>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class CUDADriverWrapper
@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions
*/

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
@ -24,8 +25,8 @@
#include <limits>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#ifdef ENABLE_FP8
@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/cudaProfilerUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cstdint>
@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte
} // namespace
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
return std::make_pair(profileIterIdxs, stopIterIdxs);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -25,9 +25,10 @@
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace common
{
@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
@ -25,7 +26,9 @@
using tensorrt_llm::kernels::AllReduceFusionOp;
using tensorrt_llm::kernels::AllReduceStrategyType;
namespace tensorrt_llm::utils::customAllReduceUtils
TRTLLM_NAMESPACE_BEGIN
namespace utils::customAllReduceUtils
{
constexpr size_t NUM_POINTERS_PER_RANK = 7;
@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
{90, AllReduceBestStrategyTableSM90},
{100, AllReduceBestStrategyTableSM100},
};
} // namespace tensorrt_llm::utils::customAllReduceUtils
} // namespace utils::customAllReduceUtils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
*/
#include "envUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
@ -25,7 +26,9 @@
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::optional<int32_t> getIntEnv(char const* name)
@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,16 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Useful when you want to inject debug code controllable with an env var.
std::optional<int32_t> getIntEnv(char const* name);
@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
bool getEnvEplbForceGdrcopy();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -19,6 +19,7 @@
#ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
#define TRTLLM_CUDA_LAMPORT_UTILS_CUH
#include "tensorrt_llm/common/config.h"
#include <array>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
@ -29,7 +30,9 @@
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
@ -279,6 +282,7 @@ private:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
#endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH

View File

@ -15,12 +15,15 @@
*/
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
#include <cuda_runtime.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
Logger::Logger()
@ -70,4 +73,6 @@ Logger* Logger::getLogger()
thread_local Logger instance;
return &instance;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,10 +16,11 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,11 +14,15 @@
* limitations under the License.
*/
#include "mcastDevMemUtils.h"
#include "tensorrt_llm/common/config.h"
#include <unordered_map>
namespace tensorrt_llm::common
using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
namespace
{
@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
{
return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,17 @@
*/
#pragma once
// Avoid circular dependency
#include "tensorrt_llm/common/config.h"
namespace tensorrt_llm::runtime
{
class McastDeviceMemory;
}
} // namespace tensorrt_llm::runtime
namespace tensorrt_llm::common
// Avoid circular dependency
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
// Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
// information. Thus a derived pointer cannot be used as the key.
McastDeviceMemory* findMcastDevMemBuffer(void* ptr);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
@ -25,8 +26,8 @@
#include <sanitizer/asan_interface.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -961,4 +962,5 @@ void calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cassert>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
@ -46,7 +47,9 @@
#include <dlfcn.h>
#endif
namespace tensorrt_llm::common::nccl_util
TRTLLM_NAMESPACE_BEGIN
namespace common::nccl_util
{
//==============================================================================
@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
return std::make_pair(tensor, buffer);
}
} // namespace tensorrt_llm::common::nccl_util
} // namespace common::nccl_util
TRTLLM_NAMESPACE_END
#endif // ENABLE_MULTI_DEVICE

View File

@ -25,10 +25,13 @@
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#include "tensorrt_llm/common/config.h"
#include <array>
namespace tensorrt_llm::common::nvtx
TRTLLM_NAMESPACE_BEGIN
namespace common::nvtx
{
inline nvtx3::color nextColor()
{
@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
#endif
}
} // namespace tensorrt_llm::common::nvtx
} // namespace common::nvtx
TRTLLM_NAMESPACE_END
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)
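
Usage is unchanged by the move into the wrapped namespace, since the macro already roots its call at ::tensorrt_llm::common::nvtx::nextColor(). A short sketch (runStep is a hypothetical function; assumes this NVTX utils header is included):

void runStep()
{
    // Declares ::nvtx3::scoped_range runStep_range(nextColor(), "runStep"),
    // naming and coloring the profiled region for the lifetime of the scope.
    NVTX3_SCOPED_RANGE(runStep);
    // ... work to be profiled ...
}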

View File

@ -29,6 +29,7 @@
#include <mutex>
#include <thread>
TRTLLM_NAMESPACE_BEGIN
#if ENABLE_MULTI_DEVICE
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
});
return creator();
}
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/workspace.h"
@ -37,7 +38,9 @@
#include <string>
#include <unordered_map>
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
// Write values into buffer
@ -178,7 +181,7 @@ struct hash
// for testing only
void const* getCommSessionHandle();
} // namespace tensorrt_llm::common::op
} // namespace common::op
inline bool isBuilding()
{
@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
std::shared_ptr<cublasHandle_t> getCublasHandle();
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
TRTLLM_NAMESPACE_END
#ifndef DEBUG
#define PLUGIN_CHECK(status) \

View File

@ -16,14 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <float.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#else
#include <cooperative_groups.h>
#endif
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
@ -30,8 +31,8 @@
namespace cg = cooperative_groups;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#include "safetensors.h"
#include "nlohmann/json.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h>
#include <cstdint>
#include <fstream>
@ -25,7 +26,9 @@
#include <utility>
#include <vector>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
using nvinfer1::DataType;
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
{
return std::make_shared<SafeTensor>(filename);
}
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <NvInferRuntime.h>
#include <cstdint>
@ -23,7 +24,9 @@
#include <memory>
#include <utility>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
class INdArray
{
@ -58,4 +61,6 @@ public:
virtual ~ISafeTensor() = default;
};
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <numeric>
#include <optional>
#include <sstream>
namespace tensorrt_llm::common::stl_utils
TRTLLM_NAMESPACE_BEGIN
namespace common::stl_utils
{
template <typename TInputIt, typename TOutputIt, typename TBinOp>
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
return oss.str();
}
} // namespace tensorrt_llm::common::stl_utils
} // namespace common::stl_utils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cerrno>
#include <cstdarg>
@ -23,7 +24,9 @@
#include <iostream>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
return values;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,16 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <chrono>
#include <iomanip>
#include <sstream>
#include "tensorrt_llm/common/timestampUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::string getCurrentTimestamp()
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
return stream.str();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,17 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
std::string getCurrentTimestamp();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cinttypes>
@ -26,7 +27,9 @@
#endif
#include <sstream>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
namespace
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
return mErrorCode;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,10 +14,13 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// CuBLAS >= 12.9.1 requires 256-byte alignment.
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
return total;
}
}; // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -18,10 +18,11 @@
#include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -30,10 +30,11 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
#include "tensorrt_llm/common/config.h"
#include <cutlass/epilogue/fusion/operations.hpp>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
};
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -24,10 +24,11 @@
#include "cute/tensor.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
namespace
{
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] bool executorConfigIsValid(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches fixExecutorConfig
if (executorConfig.getEnableChunkedContext())
@ -65,8 +66,8 @@ namespace
return true;
}
[[nodiscard]] ExecutorConfig fixExecutorConfig(
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches executorConfigIsValid
auto fixedExecutorConfig = executorConfig;
@ -241,7 +242,7 @@ private:
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
ExecutorConfig const& executorConfig, bool isEncoder,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto const gpusPerNode = jsonConfig.getGpusPerNode();
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
Executor::Impl::Impl(std::filesystem::path const& modelPath,
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
ModelType const modelType, ExecutorConfig const& executorConfig,
ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
}
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const& worldConfig = model->getWorldConfig();
auto const tp = worldConfig.getTensorParallelism();
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
shutdown();
}
void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const gptModelType = [&executorConfig, &modelConfig]()
{
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto fixedExecutorConfig = ExecutorConfig{};
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
}
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
{
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
}
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
std::filesystem::path const& modelPath)
{
#if ENABLE_MULTI_DEVICE
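
The hunks above replace bare ExecutorConfig with its fully qualified spelling. A self-contained sketch of why the ::-rooted form is the robust choice once the type's real home is the versioned namespace (the layout below is illustrative, not the repository's actual one):

// ExecutorConfig declared through the new macros lands in
// tensorrt_llm::_v1::executor but stays reachable by its unversioned name.
namespace tensorrt_llm { inline namespace _v1 { namespace executor {
struct ExecutorConfig
{
    bool enableChunkedContext = false; // hypothetical member
};
}}} // namespace tensorrt_llm::_v1::executor

namespace executor_detail // hypothetical helper namespace
{
void configure()
{
    // Resolves identically no matter which namespaces are open here.
    ::tensorrt_llm::executor::ExecutorConfig config{};
    config.enableChunkedContext = true;
}
} // namespace executor_detail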

View File

@ -16,9 +16,12 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
cudaStream_t stream = 0);
}
TRTLLM_NAMESPACE_END

View File

@ -17,12 +17,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
cudaStream_t const stream = 0);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "attentionMask.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
@ -24,8 +25,8 @@
using namespace tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/gptKernels.h"
#include "tensorrt_llm/runtime/iTensor.h"
@ -25,8 +26,8 @@
namespace tc = tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -64,4 +65,5 @@ template <typename MaskDataType>
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banBadWords.h"
using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banRepeatNgram.h"
using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16)
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf
runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,14 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h"
using namespace tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -355,4 +356,5 @@ template void printLogProbs<float>(float const* x, int const nBS, int const nBMI
template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK
@ -22,8 +23,8 @@
#define BEAM_SEARCH_DEBUG 0
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now
@ -88,7 +89,7 @@ struct BeamHypotheses
// Pointers related to beam search process, they are initialized in those two functions:
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished
FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
// Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs
int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids
@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM
#endif
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
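
The spelling change above, FinishedState -> ::tensorrt_llm::kernels::FinishedState, is safe because qualified lookup in an enclosing namespace also finds the members of its inline namespaces. A self-contained sketch of that rule, using hypothetical names rather than the real headers:

// Qualified lookup through an inline namespace (hypothetical names).
#include <type_traits>

namespace outer
{
inline namespace v1
{
struct FinishedState
{
};
} // inline namespace v1
} // namespace outer

// Both spellings denote the same type; only the mangled name carries "v1".
static_assert(std::is_same_v<::outer::FinishedState, ::outer::v1::FinishedState>);

int main()
{
    return 0;
}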

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,15 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
// Skip V1 kernels if beam_width > kMaxBeamWidthForV1
INSTANTIATE_BEAM_SEARCH(float, 16, true);
INSTANTIATE_BEAM_SEARCH(half, 16, true);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
INSTANTIATE_BEAM_SEARCH(float, 4, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true);
INSTANTIATE_BEAM_SEARCH(half, 4, false);
INSTANTIATE_BEAM_SEARCH(half, 4, true);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true);
#endif // FAST_BUILD
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/
#include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
INSTANTIATE_BEAM_SEARCH(float, 8, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true);
INSTANTIATE_BEAM_SEARCH(half, 8, false);
INSTANTIATE_BEAM_SEARCH(half, 8, true);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,11 +18,13 @@
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
#endif
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/common/stringUtils.h"
@ -31,8 +33,8 @@
using namespace tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -731,4 +733,5 @@ void beamSearchKernelLauncher(
T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,12 @@
* limitations under the License.
*/
#include "buildRelativeAttentionBiasKernel.h"
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime_api.h>
#include "buildRelativeAttentionBiasKernel.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels
{
@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel
#endif
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,10 +17,11 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -19,12 +19,15 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h"
namespace tensorrt_llm::kernels::causal_conv1d
TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{
template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda<float, float>(ConvParamsBase& params, cu
template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream);
template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d
} // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END
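
Files that previously used a single C++17 nested namespace definition, such as namespace tensorrt_llm::kernels::causal_conv1d, are split as above: the tensorrt_llm level must now come from the macro so the inline tag can sit between it and the inner path, while the inner path keeps the nested form. A compilable sketch under the same hypothetical expansion as before:

// Sketch only: hypothetical expansions mirroring the earlier config.h sketch.
#define TRTLLM_NAMESPACE_BEGIN namespace tensorrt_llm { inline namespace trtllm_demo {
#define TRTLLM_NAMESPACE_END } }

TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d // inner path keeps the C++17 nested form
{
inline constexpr int kWidth = 4; // placeholder member
} // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

int main()
{
    // Callers keep using the untagged qualified name.
    return tensorrt_llm::kernels::causal_conv1d::kWidth == 4 ? 0 : 1;
}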

View File

@ -20,11 +20,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace tensorrt_llm::kernels::causal_conv1d
TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{
#define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError())
@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);
template <typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d
} // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{
template <int NRanks>
struct SyncComm
@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params)
DISPATCH_RANKS(16);
TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!");
}
}; // namespace tensorrt_llm::kernels::ar_fusion
}; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/
#pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{
template <typename DType>
struct ElemsPerAccess;
@ -139,4 +142,6 @@ struct AllReduceFusionParams
};
void allreduce_fusion_op(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion
} // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -13,9 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
namespace tensorrt_llm::kernels::ar_fusion
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{
__global__ void lamport_initialize_kernel(float* ptr, int size)
@ -94,4 +97,6 @@ void** Workspace::get_workspace()
{
return reinterpret_cast<void**>(m_workspace);
}
}; // namespace tensorrt_llm::kernels::ar_fusion
}; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{
class Workspace
@ -41,4 +44,6 @@ private:
};
void lamport_initialize(void* ptr, int bytes, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::ar_fusion
} // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/cudaUtils.h"
@ -25,7 +26,9 @@
#include <tuple>
#include <type_traits>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
using tensorrt_llm::common::divUp;
@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce(
sync_check_cuda_error(stream);
}
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
#include <NvInferRuntime.h>
@ -24,7 +25,9 @@
#include <cuda_fp16.h>
#include <vector>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8;
@ -119,4 +122,6 @@ void customLowPrecisionAllReduce(
kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream);
int32_t max_workspace_size_lowprecision(int32_t tp_size);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "mnnvlAllreduceKernels.h"
#include "tensorrt_llm/common/config.h"
#include <cooperative_groups.h>
#include <cstddef>
#include <cstdint>
@ -31,7 +32,9 @@
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
namespace tensorrt_llm::kernels::mnnvl
TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{
using tensorrt_llm::common::isNegZero;
@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params)
}
}
} // namespace tensorrt_llm::kernels::mnnvl
} // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,13 @@
#ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h>
#include <cstdint>
namespace tensorrt_llm::kernels::mnnvl
TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{
/**
@ -66,6 +68,7 @@ struct AllReduceFusionParams
void oneshotAllreduceFusionOp(AllReduceFusionParams const& params);
void twoshotAllreduceFusionOp(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::mnnvl
} // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END
#endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion::moe
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{
template <int NRanks>
struct LamportComm
@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par
#undef MOE_FINALIZE_DISPATCH1
}
}; // namespace tensorrt_llm::kernels::ar_fusion::moe
}; // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/
#pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion::moe
TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{
static constexpr int kElemsPerAccess = 8;
static constexpr int kOneShotMaxToken = 128;
@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams
void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion::moe
} // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END
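
Taken together, this is how the commit avoids the symbol collision named in its title: the inline-namespace tag becomes part of every exported symbol's mangled name, so two builds of the library carrying different tags can be loaded into one process without clashing, while callers are untouched. A final hedged sketch, with both the tag and the function being illustrative stand-ins:

#include <cstdio>

// Hypothetical tag; the real one comes from tensorrt_llm/common/config.h.
namespace tensorrt_llm
{
inline namespace trtllm_demo
{
namespace kernels
{
void collisionDemo() // illustrative stand-in, not a real kernel entry point
{
    std::puts("resolved through the inline namespace");
}
} // namespace kernels
} // inline namespace trtllm_demo
} // namespace tensorrt_llm

int main()
{
    // The mangled symbol embeds "trtllm_demo", so an identically named
    // function from a build with a different tag cannot collide at link time.
    tensorrt_llm::kernels::collisionDemo();
    return 0;
}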

Some files were not shown because too many files have changed in this diff.