[None][fix] Introduce inline namespace to avoid symbol collision (#9541)

Signed-off-by: Yihan Wang <yihwang@nvidia.com>
Yihan Wang 2025-12-12 23:32:15 +08:00 committed by GitHub
parent af315d8ef1
commit 9df4dad3b6
621 changed files with 4168 additions and 9576 deletions
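The mechanism behind the fix, for readers skimming the diff: an `inline` namespace is transparent to name lookup but still participates in name mangling, so two builds of the library that use different ABI tags export distinct symbols and can coexist in one process. A minimal sketch (hypothetical names, not code from this commit):

#include <cstdio>

namespace tensorrt_llm
{
inline namespace _v1
{
// Mangles as tensorrt_llm::_v1::abiTag(). A second copy of the library built
// with a different tag (e.g. _v2) would export tensorrt_llm::_v2::abiTag(),
// so the two definitions never collide at link or load time.
inline char const* abiTag()
{
    return "_v1";
}
} // namespace _v1
} // namespace tensorrt_llm

int main()
{
    // Callers keep writing the un-versioned name; the inline namespace is
    // invisible to lookup but present in the mangled symbol.
    std::printf("%s\n", tensorrt_llm::abiTag());
    return 0;
}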

.gitattributes vendored
View File

@@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text

.gitignore vendored
View File

@@ -74,6 +74,7 @@ llm-test-workspace/
 cpp/include/tensorrt_llm/executor/version.h
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
+cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
 .devcontainer/.env
 /examples/layer_wise_benchmarks/profiles/

View File

@@ -1,6 +1,7 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
+ *AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,13 +18,16 @@
  */

 #include "utils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/logger.h"

 #include <random>
 #include <filesystem>
 #include <fstream>

-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {

 std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@@ -98,7 +102,8 @@ Samples parseWorkloadJson(
     if (samples.size() < maxNumSamples)
     {
         TLLM_LOG_WARNING(
-            "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
+            "Dataset size %zu is smaller than given max_num_samples "
+            "%d, max_num_samples will be ignored.\n",
             samples.size(), maxNumSamples);
     }
     return samples;
@@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
     return os;
 }

-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/executor/executor.h"

 #include <cstdint>
@@ -29,7 +30,9 @@

 #pragma once

-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {

 // using namespace tensorrt_llm::batch_manager;
@@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int

 std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);

-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,8 +16,9 @@

 #pragma once

-namespace tensorrt_llm
-{
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN

 // Base class for algorithms
 struct Algorithm
@@ -29,4 +30,4 @@ struct Algorithm
     Algorithm& operator=(Algorithm const&) = delete;
 };

-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -17,9 +17,13 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"

 #include <cstdint>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 //!
@@ -100,4 +104,6 @@ private:
     size_type mSize;
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,14 +16,19 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/tllmException.h"

+TRTLLM_NAMESPACE_BEGIN
+
 class DebugConfig
 {
 public:
     static bool isCheckDebugEnabled();
 };

+TRTLLM_NAMESPACE_END
+
 #if defined(_WIN32)
 #define TLLM_LIKELY(x) (__assume((x) == 1), (x))
 #define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@@ -35,8 +40,8 @@ public:
 #define TLLM_CHECK(val) \
     do \
     { \
-        TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
-                                            : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
+        TLLM_LIKELY(static_cast<bool>(val)) \
+            ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
     } while (0)

 #define TLLM_CHECK_WITH_INFO(val, info, ...) \
@@ -51,17 +56,17 @@ public:
 #define TLLM_CHECK_DEBUG(val) \
     do \
     { \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
         { \
-            TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
-                                                : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
+            TLLM_LIKELY(static_cast<bool>(val)) \
+                ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
         } \
     } while (0)

 #define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
     do \
     { \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
         { \
             TLLM_LIKELY(static_cast<bool>(val)) \
                 ? ((void) 0) \
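A note on why the macros above switch from bare `DebugConfig` to the fully qualified `tensorrt_llm::DebugConfig`: a macro body is expanded at its use site, so unqualified names resolve in whatever scope the caller happens to be in. A hypothetical sketch, not part of the diff:

namespace app
{
struct DebugConfig // unrelated type that happens to share the name
{
    static bool isCheckDebugEnabled();
};

void f()
{
    // TLLM_CHECK_DEBUG(...) expands right here. With an unqualified
    // DebugConfig, the expansion would find app::DebugConfig; spelling
    // tensorrt_llm::DebugConfig in the macro body pins the lookup to the
    // intended class. The inline ABI namespace keeps that spelling valid:
    // it actually names tensorrt_llm::_v1::DebugConfig by default.
}
} // namespace app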

View File

@@ -17,9 +17,13 @@

 #pragma once

 #include "c10/util/intrusive_ptr.h"
+#include "tensorrt_llm/common/config.h"

 #include <Python.h>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 // Adapted from pybind11's example implementation:
@@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
     return c10::intrusive_ptr<T>::reclaim_copy(p);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef TRTLLM_CONFIG_H
+#define TRTLLM_CONFIG_H
+
+/**
+ * \def TRTLLM_ABI_NAMESPACE
+ * This macro is used to open an implicitly inline namespace block for the ABI version.
+ * This macro can be overridden to change the ABI version.
+ * The default ABI version is _v1.
+ */
+#ifndef TRTLLM_ABI_NAMESPACE
+#define TRTLLM_ABI_NAMESPACE _v1
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
+#define TRTLLM_ABI_NAMESPACE_BEGIN \
+    inline namespace TRTLLM_ABI_NAMESPACE \
+    {
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_END
+#define TRTLLM_ABI_NAMESPACE_END }
+#endif
+
+/**
+ * \def TRTLLM_NAMESPACE_BEGIN
+ * This macro is used to open a `tensorrt_llm::` namespace block, along with any
+ * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_BEGIN \
+    namespace tensorrt_llm \
+    { \
+    TRTLLM_ABI_NAMESPACE_BEGIN
+
+/**
+ * \def TRTLLM_NAMESPACE_END
+ * This macro is used to close a `tensorrt_llm::` namespace block, along with any
+ * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_END \
+    TRTLLM_ABI_NAMESPACE_END \
+    } /* end namespace tensorrt_llm */
+
+#endif // TRTLLM_CONFIG_H
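To make the macro pair concrete, here is what a header wrapped with it expands to; `Widget` is a placeholder name for illustration, not a type from the codebase:

#include "tensorrt_llm/common/config.h"

TRTLLM_NAMESPACE_BEGIN

namespace common
{
struct Widget // placeholder for illustration
{
};
} // namespace common

TRTLLM_NAMESPACE_END

// With the default ABI tag, the preprocessor produces:
//
//   namespace tensorrt_llm { inline namespace _v1 { namespace common {
//   struct Widget {};
//   } } }
//
// Source code still writes tensorrt_llm::common::Widget, but the mangled
// symbol carries _v1. Compiling with -DTRTLLM_ABI_NAMESPACE=_v2 changes only
// the mangled names, not the spelling used at call sites.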

View File

@@ -16,6 +16,8 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #ifdef ENABLE_FP8
 #include <cuda_fp8.h>
 #include <cuda_runtime.h>
@@ -29,8 +31,8 @@
 #define USE_QGMMA
 #endif

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {
@@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
     const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

 #endif // ENABLE_FP8

View File

@@ -14,12 +14,18 @@
  * limitations under the License.
  */

+#pragma once
+
+#include "tensorrt_llm/common/config.h"
+
 #include <cstdint>
 #include <optional>
 #include <string>
 #include <unordered_set>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /// @brief Populate the start and end profiling iteration indexes from the provided environment variables
@@ -28,4 +34,6 @@ namespace tensorrt_llm::common
 std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
     std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
@@ -49,7 +50,9 @@
 // this undef.
 #endif // WIN32

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 // workspace for cublas gemm : 32MB
@@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
 DEFINE_MEMBER_CHECKER(qua)
 DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 /*
  * Macros compliant with TensorRT coding conventions

View File

@@ -16,11 +16,15 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/tllmException.h"

 #include <NvInferRuntime.h>

 #include <map>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 constexpr static size_t getDTypeSize(nvinfer1::DataType type)
@@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
     return "";
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -22,9 +22,12 @@

 #include <string>

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 class Logger
@@ -125,12 +128,12 @@ private:

     static inline std::string getPrefix(Level const level)
     {
-        return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
+        return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
     }

     static inline std::string getPrefix(Level const level, int const rank)
     {
-        return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
+        return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
     }
 };

@@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
         out << std::endl;
     }
 }

+} // namespace common
+
+TRTLLM_NAMESPACE_END
+
 #define TLLM_LOG(level, ...) \
     do \
@@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
 #define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
 #define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
 #define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
-} // namespace tensorrt_llm::common

View File

@@ -16,11 +16,15 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <functional>
 #include <memory>
 #include <optional>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /**
@@ -100,4 +104,6 @@ public:
     }
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,12 +16,14 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <cstdint>
 #include <optional>
 #include <string>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -480,4 +482,5 @@ public:
 };

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #if ENABLE_BF16
 #include <cuda_bf16.h>
 #endif // ENABLE_BF16
@@ -28,7 +29,9 @@
 #include <unordered_set>
 #include <vector>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 #if ENABLE_BF16
 static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
@@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
     }
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"

 #include <array>
@@ -41,7 +42,9 @@
     tensorrt_llm::common::RequestSpecificException( \
         __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /// @brief Enumeration of different error codes for request-specific exceptions
@@ -77,7 +80,8 @@ private:

 [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
 {
-    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
+    throw TllmException(
+        file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
 }

 [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
@@ -102,4 +106,6 @@ private:
     RequestErrorCode mErrorCode;
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,8 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <algorithm>
 #include <initializer_list>
 #include <string>
@@ -24,7 +26,9 @@
 #include <pthread.h>
 #endif

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 inline bool setThreadName(std::string const& name)
@@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
     return std::find(c.begin(), c.end(), v) != c.end();
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,7 +16,11 @@

 #pragma once

-namespace tensorrt_llm::kernels
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 namespace detail
@@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;

 } // namespace arch

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,11 +17,14 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/executor/types.h"

 #include <cstdint>
 #include <curand_kernel.h>

-namespace tensorrt_llm::kernels
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 class FinishedState
@@ -308,4 +311,6 @@ template <typename T>
 void invokeScatterDecodingParams(
     T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,11 +17,14 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"

 #include <cstdint>
 #include <cuda_runtime.h>

-namespace tensorrt_llm::kernels
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 class KVCacheIndex
@@ -53,4 +56,6 @@ private:
     UnderlyingType value;
 };

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -14,16 +14,18 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/runtime/iBuffer.h"

 using namespace tensorrt_llm::runtime;

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace kernels
 {

 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
     unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
     cudaStream_t stream);

 } // namespace kernels
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname):
     params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
     attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
     bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
-    include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
+    include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
+    include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
     num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
     fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
     const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
@@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname):
     const int COMPUTE_REG_COUNT = {compute_reg_count};
     asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
         compute_reg_count=compute_reg_count)

-    local_ns_open = ns_open if generate_cu_trtllm else ''
-    local_ns_close = ns_close if generate_cu_trtllm else ''
+    abi_ns_open = r"""
+TRTLLM_NAMESPACE_BEGIN
+namespace kernels
+{
+// clang-format off
+"""
+    abi_ns_close = r"""
+// clang-format on
+} // namespace kernels
+TRTLLM_NAMESPACE_END
+"""
+    local_ns_open = abi_ns_open if generate_cu_trtllm else ''
+    local_ns_close = abi_ns_close if generate_cu_trtllm else ''

     tmp = dict(locals(), **kspec._asdict())
@@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):

 def get_cubin_header(kernel_traits, specs_names):
     cubins = []
     cubin_lens = []
+    launchers = []
     cubins_dict = {}
     cubin_lens_dict = {}
+    launchers_dict = {}
     for kspec, fname, lname, kname in specs_names:
         if generate_cu_trtllm and not use_cubin_header(
                 kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
@@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names):
         if generate_cu_trtllm and lname != 'nullptr':
             launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
                 lname=lname)
-            if int(sm) in cubins_dict:
-                if launcher not in cubins_dict[int(sm)]:
-                    cubins_dict[int(sm)].append(launcher)
+            if int(sm) in launchers_dict:
+                if launcher not in launchers_dict[int(sm)]:
+                    launchers_dict[int(sm)].append(launcher)
             else:
-                cubins_dict[int(sm)] = [launcher]
+                launchers_dict[int(sm)] = [launcher]
         elif 'mhca' in kname:
             code = '''\
 {{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
@@ -3309,17 +3323,33 @@
     else:
         metadata_v2 = ',\n'.join(metadata_v2)

     # Add macros to only include needed cubins during compilation.
-    for sm in cubins_dict.keys():
+    # Collect all SM versions from all dictionaries
+    all_sms = sorted(
+        set(
+            list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
+            list(launchers_dict.keys())))
+    for sm in all_sms:
         macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
         macro_end = f"#endif\n"
-        cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
+        # Add cubin array declarations
+        if sm in cubins_dict:
+            cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
+        # Add cubin length declarations
         if sm in cubin_lens_dict:
             cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
+        # Add launcher declarations
+        if sm in launchers_dict:
+            launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])

     unroll_config_v1 = ',\n'.join(unroll_config_v1)
     unroll_config_v2 = ',\n'.join(unroll_config_v2)
     cubins = '\n'.join(cubins)
     cubin_lens = '\n'.join(cubin_lens)
+    launchers = '\n'.join(launchers)

     local_ns_open = ns_open
     local_ns_close = ns_close if generate_cu_trtllm else '}'
     launcher_line = '''
@@ -3431,7 +3461,157 @@ static const struct TestMetaV2
 '''.format(**locals(), copyright=copyright)

-    return code
+    # Generate header content (.h file)
+    if "GENERATE_CUBIN" in os.environ:
+        header_content = '''\
+{copyright}
+
+#pragma once
+
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
+namespace kernels{{
+
+struct FusedMultiHeadAttentionKernelMetaInfoV2
+{{
+    Data_type mDataTypeIn;
+    Data_type mDataTypeOut;
+    unsigned int mS;
+    unsigned int mStepQ;
+    unsigned int mStepKV;
+    unsigned int mD;
+    unsigned int mDV;
+    unsigned int mSageBlockSizeQ;
+    unsigned int mSageBlockSizeK;
+    unsigned int mSageBlockSizeV;
+    unsigned int mSM;
+    const unsigned char* mCubin;
+    unsigned int mCubinSize;
+    const char* mFuncName;
+    unsigned int mSharedMemBytes;
+    unsigned int mThreadsPerCTA;
+    unsigned int mUnrollStep;
+    int mAttentionMaskType;
+    int mAttentionInputLayout;
+    bool mInterleaved;
+    bool mFlashAttention;
+    bool mWarpSpecialization;
+    bool mFP32Accumulation;
+    bool mAlibiSupported;
+    bool mTiled;
+    bool mEnableAttnLogitSoftcapping;
+    bool mReturnSoftmaxStats;{launcher_line}
+}};
+
+extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
+extern const int sMhaKernelMetaInfosV2Size;
+
+}} // namespace kernels
+TRTLLM_NAMESPACE_END
+'''.format(**locals(), copyright=copyright)
+
+        # Generate source content (.cpp file)
+        source_content = '''\
+{copyright}
+
+#include "tensorrt_llm/common/config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cuda_runtime_api.h>
+
+{local_ns_open}
+//--- Cubin Arrays
+{cubins}
+
+//--- Cubin Lengths
+{cubin_lens}
+{local_ns_close}
+
+using namespace tensorrt_llm::kernels;
+
+namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
+
+class Fused_multihead_attention_params_v2;
+class Launch_params;
+
+//--- Kernel Launchers
+{launchers}
+
+// FIXME: These are duplicated declarations, we should remove them in the future.
+constexpr int32_t kSM_70 = 70;
+constexpr int32_t kSM_72 = 72;
+constexpr int32_t kSM_75 = 75;
+constexpr int32_t kSM_80 = 80;
+constexpr int32_t kSM_86 = 86;
+constexpr int32_t kSM_89 = 89;
+constexpr int32_t kSM_90 = 90;
+constexpr int32_t kSM_100 = 100;
+constexpr int32_t kSM_100f = 10100;
+constexpr int32_t kSM_103 = 103;
+constexpr int32_t kSM_120 = 120;
+constexpr int32_t kSM_121 = 121;
+
+// FIXME: These are duplicated declarations, we should remove them in the future.
+enum Data_type
+{{
+    DATA_TYPE_BOOL,
+    DATA_TYPE_FP16,
+    DATA_TYPE_FP32,
+    DATA_TYPE_INT4,
+    DATA_TYPE_INT8,
+    DATA_TYPE_INT32,
+    DATA_TYPE_BF16,
+    DATA_TYPE_E2M1,
+    DATA_TYPE_E4M3,
+    DATA_TYPE_E5M2
+}};
+
+struct FusedMultiHeadAttentionKernelMetaInfoV2
+{{
+    Data_type mDataTypeIn;
+    Data_type mDataTypeOut;
+    unsigned int mS;
+    unsigned int mStepQ;
+    unsigned int mStepKV;
+    unsigned int mD;
+    unsigned int mDV;
+    unsigned int mSageBlockSizeQ;
+    unsigned int mSageBlockSizeK;
+    unsigned int mSageBlockSizeV;
+    unsigned int mSM;
+    const unsigned char* mCubin;
+    unsigned int mCubinSize;
+    const char* mFuncName;
+    unsigned int mSharedMemBytes;
+    unsigned int mThreadsPerCTA;
+    unsigned int mUnrollStep;
+    int mAttentionMaskType;
+    int mAttentionInputLayout;
+    bool mInterleaved;
+    bool mFlashAttention;
+    bool mWarpSpecialization;
+    bool mFP32Accumulation;
+    bool mAlibiSupported;
+    bool mTiled;
+    bool mEnableAttnLogitSoftcapping;
+    bool mReturnSoftmaxStats;{launcher_line}
+}};
+
+extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
+{metadata_v2}
+}};
+
+extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
+
+}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
+'''.format(**locals(), copyright=copyright)
+    else:
+        # Non-GENERATE_CUBIN mode: use old behavior
+        header_content = code
+        source_content = None
+
+    return header_content, source_content
# This is used to add some kernels running in cubins for passing CI cases. # This is used to add some kernels running in cubins for passing CI cases.
@@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header):
         return result

-    target = "#ifndef EXCLUDE_SM_80"
-    addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
-extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
-    result = add_kernel_line(result, target, addition)
+    addition_cubin_array = """
+#ifndef EXCLUDE_SM_80
+extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
+#endif
+"""
+    addition_cubin_length = """
+#ifndef EXCLUDE_SM_80
+extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
+#endif
+"""
+    # Add cubin array and length into their corresponding sections.
+    result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
+    result = add_kernel_line(result, "//--- Cubin Lengths",
+                             addition_cubin_length)

     def modify_kernel_line(result, target, new_line):
         lines = result.split('\n')
@@ -3534,13 +3725,22 @@ def generate_files(specs_names):
     output = output.decode('utf-8').strip()
     # this gives: kname, smem bytes, threads_per_cta, loop_step
     kernel_traits = [traits.split() for traits in output.splitlines()]
-    cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
+    # Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
+    # To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header
+    cubin_header, cubin_source = get_cubin_header(kernel_traits,
+                                                  valid_specs_names)
     if generate_cu_trtllm:
-        cubin_header = modify_cubin_header(cubin_header)
+        cubin_source = modify_cubin_header(cubin_source)

+    # Write fmha_cubin.h file
     with open('./generated/fmha_cubin.h', 'w') as f:
         f.write(cubin_header)

+    # Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
+    if cubin_source is not None:
+        with open('./generated/fmha_cubin.cpp', 'w') as f:
+            f.write(cubin_source)

 def enumerate_hgmma_tma_kernels(specs, sm=90):
     specs.append(
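One consequence of the generator change worth spelling out: the generated header declares the metadata inside the wrapped namespace, while the generated .cpp defines it with the ABI tag written out; both spellings name the same entity once the inline namespace is expanded. A reduced sketch (the value 128 is illustrative, not from the generator):

#include "tensorrt_llm/common/config.h"

// Declaration, in the style of the generated fmha_cubin.h:
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
extern const int sMhaKernelMetaInfosV2Size;
} // namespace kernels
TRTLLM_NAMESPACE_END

// Definition, in the style of the generated fmha_cubin.cpp; with the default
// tag this reopens tensorrt_llm::_v1::kernels, the same namespace as above.
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
{
extern const int sMhaKernelMetaInfosV2Size = 128; // illustrative value
} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels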

View File

@@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
  */

 #pragma once

-namespace tensorrt_llm {
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
 namespace kernels {
 '''

@@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
 '''

 TEMPLATE_EPILOGUE = '''}
-}
+TRTLLM_NAMESPACE_END
 '''

 D = defaultdict(list)

View File

@@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

-namespace tensorrt_llm
-{
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
 namespace kernels
 {
 // clang-format off
@@ -96,7 +98,7 @@ namespace kernels
 cpp_file_suffex_text = R"""
 // clang-format on
 } // namespace kernels
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END
 """

 cubin_meta_info_struct_prefix_text = R"""

View File

@@ -27,7 +27,7 @@ bool initCheckDebug()
 }
 } // namespace

-bool DebugConfig::isCheckDebugEnabled()
+bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
 {
     static bool const debugEnabled = initCheckDebug();
     return debugEnabled;

View File

@@ -16,6 +16,7 @@
  */

 #include "attentionOp.h"
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/memoryUtils.h"

View File

@@ -16,6 +16,7 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/opUtils.h"
 #include "tensorrt_llm/common/quantization.h"
@@ -36,7 +37,9 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE

-namespace tensorrt_llm::common::op
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::op
 {

 class AttentionOp
@@ -543,4 +546,6 @@ private:
     UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
 };

-} // namespace tensorrt_llm::common::op
+} // namespace common::op
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasVersionCheck.h"
 #include <algorithm>
 #include <unordered_map>
@@ -24,8 +25,8 @@
 #error CUDART_VERSION Undefined!
 #endif

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t

 } // namespace common
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
@@ -24,8 +25,8 @@
 #include <optional>
 #include <string>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -185,4 +186,4 @@ public:

 } // namespace common
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -16,12 +16,13 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
 #endif // ENABLE_BF16

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

 // Operator definitions intentionally in global namespace
 namespace

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <assert.h>
 #include <cstdlib>
@@ -28,8 +29,8 @@
 #include <string>
 #include <type_traits>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

 static __host__ __device__ int hash(int val)
@@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
 };

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -18,6 +18,7 @@
 #if defined(_WIN32)
 #include <windows.h>
 #define dllOpen(name) LoadLibrary("nv" name ".dll")
 #define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
 #define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
@@ -29,6 +30,7 @@
 #endif // defined(_WIN32)

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/logger.h"

 #include <cuda.h>
@@ -36,7 +38,9 @@
 #include <cstdio>
 #include <mutex>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
@@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
     return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,6 +17,7 @@
 #ifndef CUDA_DRIVER_WRAPPER_H
 #define CUDA_DRIVER_WRAPPER_H

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include "tensorrt_llm/common/tllmException.h"

@@ -25,7 +26,9 @@
 #include <cstdio>
 #include <memory>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 class CUDADriverWrapper
@@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
     }
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 /*
  * Macros compliant with TensorRT coding conventions
 */

View File

@@ -14,6 +14,7 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
@@ -24,8 +25,8 @@
 #include <limits>
 #include <type_traits>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {
 #ifdef ENABLE_FP8
@@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
 #endif // ENABLE_FP8

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,6 +15,7 @@
  */

 #include "tensorrt_llm/common/cudaProfilerUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include <cstdint>
@@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte

 } // namespace

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
@@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
     return std::make_pair(profileIterIdxs, stopIterIdxs);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -25,9 +25,10 @@
 #if ENABLE_BF16
 #include <cuda_bf16.h>
 #endif
+#include "tensorrt_llm/common/config.h"

+TRTLLM_NAMESPACE_BEGIN

-namespace tensorrt_llm
-{
 namespace common
 {

@@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
 #endif // ENABLE_FP8

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/customAllReduceKernels.h"
@@ -25,7 +26,9 @@
 using tensorrt_llm::kernels::AllReduceFusionOp;
 using tensorrt_llm::kernels::AllReduceStrategyType;

-namespace tensorrt_llm::utils::customAllReduceUtils
+TRTLLM_NAMESPACE_BEGIN
+
+namespace utils::customAllReduceUtils
 {

 constexpr size_t NUM_POINTERS_PER_RANK = 7;
@@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
     {90, AllReduceBestStrategyTableSM90},
     {100, AllReduceBestStrategyTableSM100},
 };

-} // namespace tensorrt_llm::utils::customAllReduceUtils
+} // namespace utils::customAllReduceUtils
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  */

 #include "envUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/stringUtils.h"
@@ -25,7 +26,9 @@
 #include <optional>
 #include <string>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::optional<int32_t> getIntEnv(char const* name)
@@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
     return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,13 +16,16 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <optional>
 #include <string>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 // Useful when you want to inject some debug code controllable with env var.
 std::optional<int32_t> getIntEnv(char const* name);
@@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();

 bool getEnvEplbForceGdrcopy();

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -19,6 +19,7 @@
 #ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
 #define TRTLLM_CUDA_LAMPORT_UTILS_CUH

+#include "tensorrt_llm/common/config.h"
 #include <array>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
@@ -29,7 +30,9 @@

 #include "tensorrt_llm/common/cudaTypeUtils.cuh"

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
@@ -279,6 +282,7 @@ private:
     }
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 #endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH

View File

@@ -15,12 +15,15 @@
  */

 #include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include "tensorrt_llm/common/tllmException.h"

 #include <cuda_runtime.h>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 Logger::Logger()
@@ -70,4 +73,6 @@ Logger* Logger::getLogger()
     thread_local Logger instance;
     return &instance;
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,10 +16,11 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include <cuda_runtime.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -14,11 +14,15 @@
  * limitations under the License.
  */

 #include "mcastDevMemUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include <unordered_map>

-namespace tensorrt_llm::common
+using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
-using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;

 namespace
 {
@@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
 {
     return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,13 +15,17 @@
  */

 #pragma once

-// Avoid circular dependency
+#include "tensorrt_llm/common/config.h"
+
 namespace tensorrt_llm::runtime
 {
 class McastDeviceMemory;
-}
+} // namespace tensorrt_llm::runtime

-namespace tensorrt_llm::common
+// Avoid circular dependency
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;

 // Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
@@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
 // information. Thus a derived pointer cannot used as the key.
 McastDeviceMemory* findMcastDevMemBuffer(void* ptr);

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,6 +15,7 @@
  */

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaTypeUtils.cuh"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/memoryUtils.h"
@@ -25,8 +26,8 @@

 #include <sanitizer/asan_interface.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -961,4 +962,5 @@ void calcAlignedPointers(
 }

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,13 +16,14 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
 #include "tensorrt_llm/common/cudaUtils.h"

 #include <cassert>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
 }

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"

@@ -46,7 +47,9 @@
 #include <dlfcn.h>
 #endif

-namespace tensorrt_llm::common::nccl_util
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::nccl_util
 {

 //==============================================================================
@@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
     return std::make_pair(tensor, buffer);
 }

-} // namespace tensorrt_llm::common::nccl_util
+} // namespace common::nccl_util
+
+TRTLLM_NAMESPACE_END

 #endif // ENABLE_MULTI_DEVICE

View File

@@ -25,10 +25,13 @@
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #endif

+#include "tensorrt_llm/common/config.h"
+
 #include <array>

-namespace tensorrt_llm::common::nvtx
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::nvtx
 {
 inline nvtx3::color nextColor()
 {
@@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
 #endif
 }

-} // namespace tensorrt_llm::common::nvtx
+} // namespace common::nvtx
+
+TRTLLM_NAMESPACE_END

 #define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
     ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
 #define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)

View File

@@ -29,6 +29,7 @@
 #include <mutex>
 #include <thread>

+TRTLLM_NAMESPACE_BEGIN

 #if ENABLE_MULTI_DEVICE

 std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
@@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
     });
     return creator();
 }
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,6 +17,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/workspace.h"

@@ -37,7 +38,9 @@
 #include <string>
 #include <unordered_map>

-namespace tensorrt_llm::common::op
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::op
 {

 // Write values into buffer
@@ -178,7 +181,7 @@ struct hash
 // for testing only
 void const* getCommSessionHandle();
-} // namespace tensorrt_llm::common::op
+} // namespace common::op

 inline bool isBuilding()
 {
@@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
 std::shared_ptr<cublasHandle_t> getCublasHandle();
 std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();

+TRTLLM_NAMESPACE_END
+
 #ifndef DEBUG

 #define PLUGIN_CHECK(status) \

View File

@ -16,14 +16,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h> #include <cuda.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <float.h> #include <float.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace common namespace common
{ {
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
#endif // ENABLE_FP8 #endif // ENABLE_FP8
} // namespace common } // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#else #else
#include <cooperative_groups.h> #include <cooperative_groups.h>
#endif #endif
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -30,8 +31,8 @@
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace common namespace common
{ {
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
} }
} // namespace common } // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#include "safetensors.h" #include "safetensors.h"
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
#include <fstream> #include <fstream>
@ -25,7 +26,9 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
namespace tensorrt_llm::common::safetensors TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{ {
using nvinfer1::DataType; using nvinfer1::DataType;
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
{ {
return std::make_shared<SafeTensor>(filename); return std::make_shared<SafeTensor>(filename);
} }
} // namespace tensorrt_llm::common::safetensors } // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/logger.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
@ -23,7 +24,9 @@
#include <memory> #include <memory>
#include <utility> #include <utility>
namespace tensorrt_llm::common::safetensors TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{ {
class INdArray class INdArray
{ {
@ -58,4 +61,6 @@ public:
virtual ~ISafeTensor() = default; virtual ~ISafeTensor() = default;
}; };
} // namespace tensorrt_llm::common::safetensors } // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <functional> #include <functional>
#include <numeric> #include <numeric>
#include <optional> #include <optional>
#include <sstream> #include <sstream>
namespace tensorrt_llm::common::stl_utils TRTLLM_NAMESPACE_BEGIN
namespace common::stl_utils
{ {
template <typename TInputIt, typename TOutputIt, typename TBinOp> template <typename TInputIt, typename TOutputIt, typename TBinOp>
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
return oss.str(); return oss.str();
} }
} // namespace tensorrt_llm::common::stl_utils } // namespace common::stl_utils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cerrno> #include <cerrno>
#include <cstdarg> #include <cstdarg>
@ -23,7 +24,9 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args) void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
return values; return values;
}; };
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,16 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <chrono> #include <chrono>
#include <iomanip> #include <iomanip>
#include <sstream> #include <sstream>
#include "tensorrt_llm/common/timestampUtils.h" #include "tensorrt_llm/common/timestampUtils.h"
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
std::string getCurrentTimestamp() std::string getCurrentTimestamp()
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
return stream.str(); return stream.str();
} }
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,17 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <string> #include <string>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu" /// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
std::string getCurrentTimestamp(); std::string getCurrentTimestamp();
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
#include <cinttypes> #include <cinttypes>
@ -26,7 +27,9 @@
#endif #endif
#include <sstream> #include <sstream>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
namespace namespace
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
return mErrorCode; return mErrorCode;
} }
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,10 +14,13 @@
* limitations under the License. * limitations under the License.
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
// CuBLAS >= 12.9.1 requires 256-byte alignment. // CuBLAS >= 12.9.1 requires 256-byte alignment.
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
return total; return total;
} }
}; // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -18,10 +18,11 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h" #include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
} }
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -30,10 +30,11 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h" #include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h" #include "cutlass_extensions/epilogue/thread/fused_activations.h"
#include "tensorrt_llm/common/config.h"
#include <cutlass/epilogue/fusion/operations.hpp> #include <cutlass/epilogue/fusion/operations.hpp>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
}; };
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -24,10 +24,11 @@
#include "cute/tensor.hpp" #include "cute/tensor.hpp"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/tllmException.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
} }
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
namespace namespace
{ {
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig) [[nodiscard]] bool executorConfigIsValid(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{ {
// Make sure logic in this function matches fixExecutorConfig // Make sure logic in this function matches fixExecutorConfig
if (executorConfig.getEnableChunkedContext()) if (executorConfig.getEnableChunkedContext())
@ -65,8 +66,8 @@ namespace
return true; return true;
} }
[[nodiscard]] ExecutorConfig fixExecutorConfig( [[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{ {
// Make sure logic in this function matches executorConfigIsValid // Make sure logic in this function matches executorConfigIsValid
auto fixedExecutorConfig = executorConfig; auto fixedExecutorConfig = executorConfig;
@ -241,7 +242,7 @@ private:
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt, void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig, std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
ExecutorConfig const& executorConfig, bool isEncoder, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt) std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{ {
auto const gpusPerNode = jsonConfig.getGpusPerNode(); auto const gpusPerNode = jsonConfig.getGpusPerNode();
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
Executor::Impl::Impl(std::filesystem::path const& modelPath, Executor::Impl::Impl(std::filesystem::path const& modelPath,
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType, std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json"); auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr, Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr, std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
ModelType const modelType, ExecutorConfig const& executorConfig, ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt) std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{ {
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr); auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
} }
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel, Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto const& worldConfig = model->getWorldConfig(); auto const& worldConfig = model->getWorldConfig();
auto const tp = worldConfig.getTensorParallelism(); auto const tp = worldConfig.getTensorParallelism();
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
shutdown(); shutdown();
} }
void Executor::Impl::initialize(ExecutorConfig const& executorConfig) void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine, std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto const gptModelType = [&executorConfig, &modelConfig]() auto const gptModelType = [&executorConfig, &modelConfig]()
{ {
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine, std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto fixedExecutorConfig = ExecutorConfig{}; auto fixedExecutorConfig = ExecutorConfig{};
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig()); fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
} }
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp, void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig, std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig) std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
{ {
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
} }
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp, void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
std::filesystem::path const& modelPath) std::filesystem::path const& modelPath)
{ {
#if ENABLE_MULTI_DEVICE #if ENABLE_MULTI_DEVICE

View File

@ -16,9 +16,12 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache, void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
cudaStream_t stream = 0); cudaStream_t stream = 0);
} }
TRTLLM_NAMESPACE_END

View File

@ -17,12 +17,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux, void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048, int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
cudaStream_t const stream = 0); cudaStream_t const stream = 0);
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#include "attentionMask.h" #include "attentionMask.h"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
@ -24,8 +25,8 @@
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/gptKernels.h"
#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/iTensor.h"
@ -25,8 +26,8 @@
namespace tc = tensorrt_llm::common; namespace tc = tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -64,4 +65,5 @@ template <typename MaskDataType>
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream); void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banBadWords.h" #include "tensorrt_llm/kernels/banBadWords.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::runtime;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream); SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
cudaStream_t stream); cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banRepeatNgram.h" #include "tensorrt_llm/kernels/banRepeatNgram.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::runtime;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16)
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf
runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream); runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,14 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/kernels/beamSearchKernels.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -355,4 +356,5 @@ template void printLogProbs<float>(float const* x, int const nBS, int const nBMI
template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV); template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK #include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK
@ -22,8 +23,8 @@
#define BEAM_SEARCH_DEBUG 0 #define BEAM_SEARCH_DEBUG 0
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now
@ -88,7 +89,7 @@ struct BeamHypotheses
// Pointers related to beam search process, they are initialized in those two functions: // Pointers related to beam search process, they are initialized in those two functions:
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward // [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished
FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished ::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
// Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs // Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs
int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids
@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, __global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates, ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, __global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates, ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM
#endif #endif
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,15 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
// Skip V1 kernels if beam_width > kMaxBeamWidthForV1 // Skip V1 kernels if beam_width > kMaxBeamWidthForV1
INSTANTIATE_BEAM_SEARCH(float, 16, true); INSTANTIATE_BEAM_SEARCH(float, 16, true);
INSTANTIATE_BEAM_SEARCH(half, 16, true); INSTANTIATE_BEAM_SEARCH(half, 16, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
INSTANTIATE_BEAM_SEARCH(float, 4, false); INSTANTIATE_BEAM_SEARCH(float, 4, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true);
INSTANTIATE_BEAM_SEARCH(half, 4, false); INSTANTIATE_BEAM_SEARCH(half, 4, false);
INSTANTIATE_BEAM_SEARCH(half, 4, true); INSTANTIATE_BEAM_SEARCH(half, 4, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
INSTANTIATE_BEAM_SEARCH(float, 8, false); INSTANTIATE_BEAM_SEARCH(float, 8, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true);
INSTANTIATE_BEAM_SEARCH(half, 8, false); INSTANTIATE_BEAM_SEARCH(half, 8, false);
INSTANTIATE_BEAM_SEARCH(half, 8, true); INSTANTIATE_BEAM_SEARCH(half, 8, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,11 +18,13 @@
#error CUDART_VERSION Undefined! #error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050) #elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh> #include <cub/cub.cuh>
#else #else
#include "3rdparty/cub/cub.cuh" #include "3rdparty/cub/cub.cuh"
#endif #endif
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
@ -31,8 +33,8 @@
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -731,4 +733,5 @@ void beamSearchKernelLauncher(
T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream); T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,12 @@
* limitations under the License. * limitations under the License.
*/ */
#include "buildRelativeAttentionBiasKernel.h"
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include "buildRelativeAttentionBiasKernel.h" TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel
#endif #endif
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,10 +17,11 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat
cudaStream_t stream); cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -19,12 +19,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <cub/block/block_load.cuh> #include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh> #include <cub/block/block_store.cuh>
#include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h" #include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h"
namespace tensorrt_llm::kernels::causal_conv1d TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{ {
template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_> template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda<float, float>(ConvParamsBase& params, cu
template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream);
template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d } // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

View File

@ -20,11 +20,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
namespace tensorrt_llm::kernels::causal_conv1d TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{ {
#define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError()) #define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError())
@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);
template <typename input_t, typename weight_t> template <typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d } // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh" #include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
template <int NRanks> template <int NRanks>
struct SyncComm struct SyncComm
@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params)
DISPATCH_RANKS(16); DISPATCH_RANKS(16);
TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!"); TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!");
} }
}; // namespace tensorrt_llm::kernels::ar_fusion }; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
template <typename DType> template <typename DType>
struct ElemsPerAccess; struct ElemsPerAccess;
@ -139,4 +142,6 @@ struct AllReduceFusionParams
}; };
void allreduce_fusion_op(AllReduceFusionParams const& params); void allreduce_fusion_op(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion } // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -13,9 +13,12 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
__global__ void lamport_initialize_kernel(float* ptr, int size) __global__ void lamport_initialize_kernel(float* ptr, int size)
@ -94,4 +97,6 @@ void** Workspace::get_workspace()
{ {
return reinterpret_cast<void**>(m_workspace); return reinterpret_cast<void**>(m_workspace);
} }
}; // namespace tensorrt_llm::kernels::ar_fusion }; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
class Workspace class Workspace
@ -41,4 +44,6 @@ private:
}; };
void lamport_initialize(void* ptr, int bytes, cudaStream_t stream); void lamport_initialize(void* ptr, int bytes, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::ar_fusion } // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
@ -25,7 +26,9 @@
#include <tuple> #include <tuple>
#include <type_traits> #include <type_traits>
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
using tensorrt_llm::common::divUp; using tensorrt_llm::common::divUp;
@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce(
sync_check_cuda_error(stream); sync_check_cuda_error(stream);
} }
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
@ -24,7 +25,9 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <vector> #include <vector>
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8; constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8;
@ -119,4 +122,6 @@ void customLowPrecisionAllReduce(
kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream);
int32_t max_workspace_size_lowprecision(int32_t tp_size); int32_t max_workspace_size_lowprecision(int32_t tp_size);
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "mnnvlAllreduceKernels.h" #include "mnnvlAllreduceKernels.h"
#include "tensorrt_llm/common/config.h"
#include <cooperative_groups.h> #include <cooperative_groups.h>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
@ -31,7 +32,9 @@
#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
namespace tensorrt_llm::kernels::mnnvl TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{ {
using tensorrt_llm::common::isNegZero; using tensorrt_llm::common::isNegZero;
@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params)
} }
} }
} // namespace tensorrt_llm::kernels::mnnvl } // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,13 @@
#ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
namespace tensorrt_llm::kernels::mnnvl TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{ {
/** /**
@ -66,6 +68,7 @@ struct AllReduceFusionParams
void oneshotAllreduceFusionOp(AllReduceFusionParams const& params); void oneshotAllreduceFusionOp(AllReduceFusionParams const& params);
void twoshotAllreduceFusionOp(AllReduceFusionParams const& params); void twoshotAllreduceFusionOp(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::mnnvl } // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END
#endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh" #include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion::moe TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{ {
template <int NRanks> template <int NRanks>
struct LamportComm struct LamportComm
@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par
#undef MOE_FINALIZE_DISPATCH1 #undef MOE_FINALIZE_DISPATCH1
} }
}; // namespace tensorrt_llm::kernels::ar_fusion::moe }; // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion::moe TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{ {
static constexpr int kElemsPerAccess = 8; static constexpr int kElemsPerAccess = 8;
static constexpr int kOneShotMaxToken = 128; static constexpr int kOneShotMaxToken = 128;
@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams
void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params); void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion::moe } // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END

Some files were not shown because too many files have changed in this diff.