mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][fix] Introduce inline namespace to avoid symbol collision (#9541)
Signed-off-by: Yihan Wang <yihwang@nvidia.com>
This commit is contained in:
parent
af315d8ef1
commit
9df4dad3b6
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
|
||||
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
|
||||
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -74,6 +74,7 @@ llm-test-workspace/
|
||||
cpp/include/tensorrt_llm/executor/version.h
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
|
||||
.devcontainer/.env
|
||||
/examples/layer_wise_benchmarks/profiles/
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
|
||||
*AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -17,13 +18,16 @@
|
||||
*/
|
||||
|
||||
#include "utils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <random>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
namespace tensorrt_llm::benchmark
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace benchmark
|
||||
{
|
||||
|
||||
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
|
||||
@ -98,7 +102,8 @@ Samples parseWorkloadJson(
|
||||
if (samples.size() < maxNumSamples)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
|
||||
"Dataset size %zu is smaller than given max_num_samples "
|
||||
"%d, max_num_samples will be ignored.\n",
|
||||
samples.size(), maxNumSamples);
|
||||
}
|
||||
return samples;
|
||||
@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
|
||||
return os;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::benchmark
|
||||
} // namespace benchmark
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/executor/executor.h"
|
||||
|
||||
#include <cstdint>
|
||||
@ -29,7 +30,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm::benchmark
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace benchmark
|
||||
{
|
||||
|
||||
// using namespace tensorrt_llm::batch_manager;
|
||||
@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
|
||||
|
||||
std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
|
||||
|
||||
} // namespace tensorrt_llm::benchmark
|
||||
} // namespace benchmark
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,8 +16,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
// Base class for algorithms
|
||||
struct Algorithm
|
||||
@ -29,4 +30,4 @@ struct Algorithm
|
||||
Algorithm& operator=(Algorithm const&) = delete;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,9 +17,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
//!
|
||||
@ -100,4 +104,6 @@ private:
|
||||
size_type mSize;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,14 +16,19 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
class DebugConfig
|
||||
{
|
||||
public:
|
||||
static bool isCheckDebugEnabled();
|
||||
};
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
|
||||
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
|
||||
@ -35,8 +40,8 @@ public:
|
||||
#define TLLM_CHECK(val) \
|
||||
do \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
|
||||
@ -51,17 +56,17 @@ public:
|
||||
#define TLLM_CHECK_DEBUG(val) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) \
|
||||
|
||||
@ -17,9 +17,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "c10/util/intrusive_ptr.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <Python.h>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// Adapted from pybind11's example implementation:
|
||||
@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
|
||||
return c10::intrusive_ptr<T>::reclaim_copy(p);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
62
cpp/include/tensorrt_llm/common/config.h
Normal file
62
cpp/include/tensorrt_llm/common/config.h
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#ifndef TRTLLM_CONFIG_H
|
||||
#define TRTLLM_CONFIG_H
|
||||
|
||||
/**
|
||||
* \def TRTLLM_ABI_NAMESPACE
|
||||
* This macro names the inline namespace that TRTLLM_ABI_NAMESPACE_BEGIN opens for the ABI version.
|
||||
* This macro can be overridden to change the ABI version.
|
||||
* The default ABI version is _v1.
|
||||
*/
|
||||
#ifndef TRTLLM_ABI_NAMESPACE
|
||||
#define TRTLLM_ABI_NAMESPACE _v1
|
||||
#endif
|
||||
|
||||
#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
|
||||
#define TRTLLM_ABI_NAMESPACE_BEGIN \
|
||||
inline namespace TRTLLM_ABI_NAMESPACE \
|
||||
{
|
||||
#endif
|
||||
|
||||
#ifndef TRTLLM_ABI_NAMESPACE_END
|
||||
#define TRTLLM_ABI_NAMESPACE_END }
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def TRTLLM_NAMESPACE_BEGIN
|
||||
* This macro is used to open a `tensorrt_llm::` namespace block, along with any
|
||||
* inline ABI namespace block opened by TRTLLM_ABI_NAMESPACE_BEGIN.
|
||||
* This macro is defined by TensorRT-LLM and may not be overridden.
|
||||
*/
|
||||
#define TRTLLM_NAMESPACE_BEGIN \
|
||||
namespace tensorrt_llm \
|
||||
{ \
|
||||
TRTLLM_ABI_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* \def TRTLLM_NAMESPACE_END
|
||||
* This macro is used to close a `tensorrt_llm::` namespace block, along with any
|
||||
* inline ABI namespace block opened by TRTLLM_ABI_NAMESPACE_BEGIN.
|
||||
* This macro is defined by TensorRT-LLM and may not be overridden.
|
||||
*/
|
||||
#define TRTLLM_NAMESPACE_END \
|
||||
TRTLLM_ABI_NAMESPACE_END \
|
||||
} /* end namespace tensorrt_llm */
|
||||
|
||||
#endif // TRTLLM_CONFIG_H
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
#include <cuda_fp8.h>
|
||||
#include <cuda_runtime.h>
|
||||
@ -29,8 +31,8 @@
|
||||
#define USE_QGMMA
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
|
||||
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
@ -14,12 +14,18 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Populate the start and end profiling iteration indexes from the provided environment variables
|
||||
@ -28,4 +34,6 @@ namespace tensorrt_llm::common
|
||||
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
|
||||
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
@ -49,7 +50,9 @@
|
||||
// this undef.
|
||||
#endif // WIN32
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// workspace for cublas gemm : 32MB
|
||||
@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
|
||||
DEFINE_MEMBER_CHECKER(qua)
|
||||
DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
/*
|
||||
* Macros compliant with TensorRT coding conventions
|
||||
|
||||
@ -16,11 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
#include <NvInferRuntime.h>
|
||||
#include <map>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
constexpr static size_t getDTypeSize(nvinfer1::DataType type)
|
||||
@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
|
||||
return "";
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -22,9 +22,12 @@
|
||||
#include <string>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
class Logger
|
||||
@ -125,12 +128,12 @@ private:
|
||||
|
||||
static inline std::string getPrefix(Level const level)
|
||||
{
|
||||
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
|
||||
return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
|
||||
}
|
||||
|
||||
static inline std::string getPrefix(Level const level, int const rank)
|
||||
{
|
||||
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
|
||||
return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
|
||||
}
|
||||
};
|
||||
|
||||
@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
|
||||
out << std::endl;
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#define TLLM_LOG(level, ...) \
|
||||
do \
|
||||
@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
|
||||
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
|
||||
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
|
||||
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
@ -16,11 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/**
|
||||
@ -100,4 +104,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -480,4 +482,5 @@ public:
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#if ENABLE_BF16
|
||||
#include <cuda_bf16.h>
|
||||
#endif // ENABLE_BF16
|
||||
@ -28,7 +29,9 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
#if ENABLE_BF16
|
||||
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
|
||||
@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
#include <array>
|
||||
@ -41,7 +42,9 @@
|
||||
tensorrt_llm::common::RequestSpecificException( \
|
||||
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Enumeration of different error codes for request-specific exceptions
|
||||
@ -77,7 +80,8 @@ private:
|
||||
|
||||
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
|
||||
{
|
||||
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
|
||||
throw TllmException(
|
||||
file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
|
||||
}
|
||||
|
||||
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
|
||||
@ -102,4 +106,6 @@ private:
|
||||
RequestErrorCode mErrorCode;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <initializer_list>
|
||||
#include <string>
|
||||
@ -24,7 +26,9 @@
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
inline bool setThreadName(std::string const& name)
|
||||
@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
|
||||
return std::find(c.begin(), c.end(), v) != c.end();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,7 +16,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
namespace detail
|
||||
@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;
|
||||
|
||||
} // namespace arch
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,11 +17,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/executor/types.h"
|
||||
#include <cstdint>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
class FinishedState
|
||||
@ -308,4 +311,6 @@ template <typename T>
|
||||
void invokeScatterDecodingParams(
|
||||
T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,11 +17,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
class KVCacheIndex
|
||||
@ -53,4 +56,6 @@ private:
|
||||
UnderlyingType value;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,16 +14,18 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
|
||||
using namespace tensorrt_llm::runtime;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
|
||||
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
|
||||
cudaStream_t stream);
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname):
|
||||
params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
|
||||
attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
|
||||
bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
|
||||
include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
|
||||
include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
|
||||
include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
|
||||
num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
|
||||
fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
|
||||
const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
|
||||
@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname):
|
||||
const int COMPUTE_REG_COUNT = {compute_reg_count};
|
||||
asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
|
||||
compute_reg_count=compute_reg_count)
|
||||
local_ns_open = ns_open if generate_cu_trtllm else ''
|
||||
local_ns_close = ns_close if generate_cu_trtllm else ''
|
||||
abi_ns_open = r"""
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels
|
||||
{
|
||||
// clang-format off
|
||||
"""
|
||||
abi_ns_close = r"""
|
||||
// clang-format on
|
||||
} // namespace kernels
|
||||
TRTLLM_NAMESPACE_END
|
||||
"""
|
||||
local_ns_open = abi_ns_open if generate_cu_trtllm else ''
|
||||
local_ns_close = abi_ns_close if generate_cu_trtllm else ''
|
||||
|
||||
tmp = dict(locals(), **kspec._asdict())
|
||||
|
||||
@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):
|
||||
def get_cubin_header(kernel_traits, specs_names):
|
||||
cubins = []
|
||||
cubin_lens = []
|
||||
launchers = []
|
||||
cubins_dict = {}
|
||||
cubin_lens_dict = {}
|
||||
launchers_dict = {}
|
||||
for kspec, fname, lname, kname in specs_names:
|
||||
if generate_cu_trtllm and not use_cubin_header(
|
||||
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
|
||||
@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names):
|
||||
if generate_cu_trtllm and lname != 'nullptr':
|
||||
launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
|
||||
lname=lname)
|
||||
if int(sm) in cubins_dict:
|
||||
if launcher not in cubins_dict[int(sm)]:
|
||||
cubins_dict[int(sm)].append(launcher)
|
||||
if int(sm) in launchers_dict:
|
||||
if launcher not in launchers_dict[int(sm)]:
|
||||
launchers_dict[int(sm)].append(launcher)
|
||||
else:
|
||||
cubins_dict[int(sm)] = [launcher]
|
||||
launchers_dict[int(sm)] = [launcher]
|
||||
elif 'mhca' in kname:
|
||||
code = '''\
|
||||
{{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
|
||||
@ -3309,17 +3323,33 @@ def get_cubin_header(kernel_traits, specs_names):
|
||||
else:
|
||||
metadata_v2 = ',\n'.join(metadata_v2)
|
||||
# Add macros to only include needed cubins during compilation.
|
||||
for sm in cubins_dict.keys():
|
||||
# Collect all SM versions from all dictionaries
|
||||
all_sms = sorted(
|
||||
set(
|
||||
list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
|
||||
list(launchers_dict.keys())))
|
||||
|
||||
for sm in all_sms:
|
||||
macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
|
||||
macro_end = f"#endif\n"
|
||||
|
||||
# Add cubin array declarations
|
||||
if sm in cubins_dict:
|
||||
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
|
||||
|
||||
# Add cubin length declarations
|
||||
if sm in cubin_lens_dict:
|
||||
cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
|
||||
|
||||
# Add launcher declarations
|
||||
if sm in launchers_dict:
|
||||
launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])
|
||||
|
||||
unroll_config_v1 = ',\n'.join(unroll_config_v1)
|
||||
unroll_config_v2 = ',\n'.join(unroll_config_v2)
|
||||
cubins = '\n'.join(cubins)
|
||||
cubin_lens = '\n'.join(cubin_lens)
|
||||
launchers = '\n'.join(launchers)
|
||||
local_ns_open = ns_open
|
||||
local_ns_close = ns_close if generate_cu_trtllm else '}'
|
||||
launcher_line = '''
|
||||
@ -3431,7 +3461,157 @@ static const struct TestMetaV2
|
||||
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
|
||||
return code
|
||||
# Generate header content (.h file)
|
||||
if "GENERATE_CUBIN" in os.environ:
|
||||
header_content = '''\
|
||||
{copyright}
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels{{
|
||||
|
||||
struct FusedMultiHeadAttentionKernelMetaInfoV2
|
||||
{{
|
||||
Data_type mDataTypeIn;
|
||||
Data_type mDataTypeOut;
|
||||
unsigned int mS;
|
||||
unsigned int mStepQ;
|
||||
unsigned int mStepKV;
|
||||
unsigned int mD;
|
||||
unsigned int mDV;
|
||||
unsigned int mSageBlockSizeQ;
|
||||
unsigned int mSageBlockSizeK;
|
||||
unsigned int mSageBlockSizeV;
|
||||
unsigned int mSM;
|
||||
const unsigned char* mCubin;
|
||||
unsigned int mCubinSize;
|
||||
const char* mFuncName;
|
||||
unsigned int mSharedMemBytes;
|
||||
unsigned int mThreadsPerCTA;
|
||||
unsigned int mUnrollStep;
|
||||
int mAttentionMaskType;
|
||||
int mAttentionInputLayout;
|
||||
bool mInterleaved;
|
||||
bool mFlashAttention;
|
||||
bool mWarpSpecialization;
|
||||
bool mFP32Accumulation;
|
||||
bool mAlibiSupported;
|
||||
bool mTiled;
|
||||
bool mEnableAttnLogitSoftcapping;
|
||||
bool mReturnSoftmaxStats;{launcher_line}
|
||||
}};
|
||||
|
||||
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
|
||||
extern const int sMhaKernelMetaInfosV2Size;
|
||||
|
||||
}} // namespace kernels
|
||||
TRTLLM_NAMESPACE_END
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
# Generate source content (.cpp file)
|
||||
source_content = '''\
|
||||
{copyright}
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
{local_ns_open}
|
||||
|
||||
//--- Cubin Arrays
|
||||
{cubins}
|
||||
|
||||
//--- Cubin Lengths
|
||||
{cubin_lens}
|
||||
|
||||
{local_ns_close}
|
||||
|
||||
using namespace tensorrt_llm::kernels;
|
||||
|
||||
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
|
||||
|
||||
class Fused_multihead_attention_params_v2;
|
||||
class Launch_params;
|
||||
|
||||
//--- Kernel Launchers
|
||||
{launchers}
|
||||
|
||||
// FIXME: These are duplicated declarations, we should remove them in the future.
|
||||
constexpr int32_t kSM_70 = 70;
|
||||
constexpr int32_t kSM_72 = 72;
|
||||
constexpr int32_t kSM_75 = 75;
|
||||
constexpr int32_t kSM_80 = 80;
|
||||
constexpr int32_t kSM_86 = 86;
|
||||
constexpr int32_t kSM_89 = 89;
|
||||
constexpr int32_t kSM_90 = 90;
|
||||
constexpr int32_t kSM_100 = 100;
|
||||
constexpr int32_t kSM_100f = 10100;
|
||||
constexpr int32_t kSM_103 = 103;
|
||||
constexpr int32_t kSM_120 = 120;
|
||||
constexpr int32_t kSM_121 = 121;
|
||||
|
||||
// FIXME: These are duplicated declarations, we should remove them in the future.
|
||||
enum Data_type
|
||||
{{
|
||||
DATA_TYPE_BOOL,
|
||||
DATA_TYPE_FP16,
|
||||
DATA_TYPE_FP32,
|
||||
DATA_TYPE_INT4,
|
||||
DATA_TYPE_INT8,
|
||||
DATA_TYPE_INT32,
|
||||
DATA_TYPE_BF16,
|
||||
DATA_TYPE_E2M1,
|
||||
DATA_TYPE_E4M3,
|
||||
DATA_TYPE_E5M2
|
||||
}};
|
||||
|
||||
struct FusedMultiHeadAttentionKernelMetaInfoV2
|
||||
{{
|
||||
Data_type mDataTypeIn;
|
||||
Data_type mDataTypeOut;
|
||||
unsigned int mS;
|
||||
unsigned int mStepQ;
|
||||
unsigned int mStepKV;
|
||||
unsigned int mD;
|
||||
unsigned int mDV;
|
||||
unsigned int mSageBlockSizeQ;
|
||||
unsigned int mSageBlockSizeK;
|
||||
unsigned int mSageBlockSizeV;
|
||||
unsigned int mSM;
|
||||
const unsigned char* mCubin;
|
||||
unsigned int mCubinSize;
|
||||
const char* mFuncName;
|
||||
unsigned int mSharedMemBytes;
|
||||
unsigned int mThreadsPerCTA;
|
||||
unsigned int mUnrollStep;
|
||||
int mAttentionMaskType;
|
||||
int mAttentionInputLayout;
|
||||
bool mInterleaved;
|
||||
bool mFlashAttention;
|
||||
bool mWarpSpecialization;
|
||||
bool mFP32Accumulation;
|
||||
bool mAlibiSupported;
|
||||
bool mTiled;
|
||||
bool mEnableAttnLogitSoftcapping;
|
||||
bool mReturnSoftmaxStats;{launcher_line}
|
||||
}};
|
||||
|
||||
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
|
||||
{metadata_v2}
|
||||
}};
|
||||
|
||||
extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
|
||||
}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
else:
|
||||
# Non-GENERATE_CUBIN mode: use old behavior
|
||||
header_content = code
|
||||
source_content = None
|
||||
|
||||
return header_content, source_content
|
||||
|
||||
|
||||
# This is used to add some kernels running in cubins for passing CI cases.
|
||||
@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header):
|
||||
return result
|
||||
|
||||
target = "#ifndef EXCLUDE_SM_80"
|
||||
addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
|
||||
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
|
||||
result = add_kernel_line(result, target, addition)
|
||||
addition_cubin_array = """
|
||||
#ifndef EXCLUDE_SM_80
|
||||
extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
|
||||
#endif
|
||||
"""
|
||||
addition_cubin_length = """
|
||||
#ifndef EXCLUDE_SM_80
|
||||
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
|
||||
#endif
|
||||
"""
|
||||
# Add cubin array and length into their corresponding sections.
|
||||
result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
|
||||
result = add_kernel_line(result, "//--- Cubin Lengths",
|
||||
addition_cubin_length)
|
||||
|
||||
def modify_kernel_line(result, target, new_line):
|
||||
lines = result.split('\n')
|
||||
@ -3534,13 +3725,22 @@ def generate_files(specs_names):
|
||||
output = output.decode('utf-8').strip()
|
||||
# this gives: kname, smem bytes, threads_per_cta, loop_step
|
||||
kernel_traits = [traits.split() for traits in output.splitlines()]
|
||||
cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
|
||||
# Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
|
||||
# get_cubin_header returns (header, source); source is None unless GENERATE_CUBIN is set in the environment
|
||||
cubin_header, cubin_source = get_cubin_header(kernel_traits,
|
||||
valid_specs_names)
|
||||
if generate_cu_trtllm:
|
||||
cubin_header = modify_cubin_header(cubin_header)
|
||||
cubin_source = modify_cubin_header(cubin_source)
|
||||
|
||||
# Write fmha_cubin.h file
|
||||
with open('./generated/fmha_cubin.h', 'w') as f:
|
||||
f.write(cubin_header)
|
||||
|
||||
# Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
|
||||
if cubin_source is not None:
|
||||
with open('./generated/fmha_cubin.cpp', 'w') as f:
|
||||
f.write(cubin_source)
|
||||
|
||||
|
||||
def enumerate_hgmma_tma_kernels(specs, sm=90):
|
||||
specs.append(
|
||||
|
||||
@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
namespace tensorrt_llm {
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels {
|
||||
'''
|
||||
|
||||
@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
|
||||
'''
|
||||
|
||||
TEMPLATE_EPILOGUE = '''}
|
||||
}
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
'''
|
||||
|
||||
D = defaultdict(list)
|
||||
|
||||
@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels
|
||||
{
|
||||
// clang-format off
|
||||
@ -96,7 +98,7 @@ namespace kernels
|
||||
cpp_file_suffex_text = R"""
|
||||
// clang-format on
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
"""
|
||||
|
||||
cubin_meta_info_struct_prefix_text = R"""
|
||||
|
||||
@ -27,7 +27,7 @@ bool initCheckDebug()
|
||||
}
|
||||
} // namespace
|
||||
|
||||
bool DebugConfig::isCheckDebugEnabled()
|
||||
bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
|
||||
{
|
||||
static bool const debugEnabled = initCheckDebug();
|
||||
return debugEnabled;
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#include "attentionOp.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/common/quantization.h"
|
||||
@ -36,7 +37,9 @@
|
||||
#include <nccl.h>
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
namespace tensorrt_llm::common::op
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::op
|
||||
{
|
||||
|
||||
class AttentionOp
|
||||
@ -543,4 +546,6 @@ private:
|
||||
UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common::op
|
||||
} // namespace common::op
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasVersionCheck.h"
|
||||
#include <algorithm>
|
||||
#include <unordered_map>
|
||||
@ -24,8 +25,8 @@
|
||||
#error CUDART_VERSION Undefined!
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t
|
||||
|
||||
} // namespace common
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <cublasLt.h>
|
||||
#include <cublas_v2.h>
|
||||
@ -24,8 +25,8 @@
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -185,4 +186,4 @@ public:
|
||||
|
||||
} // namespace common
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
|
||||
#endif // ENABLE_BF16
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
// Operator definitions intentionally in global namespace
|
||||
namespace
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <assert.h>
|
||||
#include <cstdlib>
|
||||
@ -28,8 +29,8 @@
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
static __host__ __device__ int hash(int val)
|
||||
@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
|
||||
#define dllOpen(name) LoadLibrary("nv" name ".dll")
|
||||
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
|
||||
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
|
||||
@ -29,6 +30,7 @@
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <cuda.h>
|
||||
@ -36,7 +38,9 @@
|
||||
#include <cstdio>
|
||||
#include <mutex>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
|
||||
@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
|
||||
return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#ifndef CUDA_DRIVER_WRAPPER_H
|
||||
#define CUDA_DRIVER_WRAPPER_H
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
@ -25,7 +26,9 @@
|
||||
#include <cstdio>
|
||||
#include <memory>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
class CUDADriverWrapper
|
||||
@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
/*
|
||||
* Macros compliant with TensorRT coding conventions
|
||||
*/
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
@ -24,8 +25,8 @@
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
#ifdef ENABLE_FP8
|
||||
@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
|
||||
|
||||
#endif // ENABLE_FP8
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/cudaProfilerUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include <cstdint>
|
||||
@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
|
||||
@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
|
||||
return std::make_pair(profileIterIdxs, stopIterIdxs);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -25,9 +25,10 @@
|
||||
#if ENABLE_BF16
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
|
||||
@ -25,7 +26,9 @@
|
||||
using tensorrt_llm::kernels::AllReduceFusionOp;
|
||||
using tensorrt_llm::kernels::AllReduceStrategyType;
|
||||
|
||||
namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace utils::customAllReduceUtils
|
||||
{
|
||||
|
||||
constexpr size_t NUM_POINTERS_PER_RANK = 7;
|
||||
@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
|
||||
{90, AllReduceBestStrategyTableSM90},
|
||||
{100, AllReduceBestStrategyTableSM100},
|
||||
};
|
||||
} // namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
} // namespace utils::customAllReduceUtils
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
|
||||
#include "envUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
@ -25,7 +26,9 @@
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::optional<int32_t> getIntEnv(char const* name)
|
||||
@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
|
||||
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,13 +16,16 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
// Useful when you want to inject some debug code controllable with env var.
|
||||
std::optional<int32_t> getIntEnv(char const* name);
|
||||
@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
|
||||
|
||||
bool getEnvEplbForceGdrcopy();
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
#define TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <array>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
@ -29,7 +30,9 @@
|
||||
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
|
||||
@ -279,6 +282,7 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
|
||||
@ -15,12 +15,15 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
Logger::Logger()
|
||||
@ -70,4 +73,6 @@ Logger* Logger::getLogger()
|
||||
thread_local Logger instance;
|
||||
return &instance;
|
||||
}
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,10 +16,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,11 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "mcastDevMemUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
|
||||
{
|
||||
return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
|
||||
}
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,13 +15,17 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
// Avoid circular dependency
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
namespace tensorrt_llm::runtime
|
||||
{
|
||||
class McastDeviceMemory;
|
||||
}
|
||||
} // namespace tensorrt_llm::runtime
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
// Avoid circular dependency
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
|
||||
// Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
|
||||
@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
|
||||
// information. Thus a derived pointer cannot used as the key.
|
||||
McastDeviceMemory* findMcastDevMemBuffer(void* ptr);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
@ -25,8 +26,8 @@
|
||||
|
||||
#include <sanitizer/asan_interface.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -961,4 +962,5 @@ void calcAlignedPointers(
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,13 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
|
||||
@ -46,7 +47,9 @@
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
|
||||
return std::make_pair(tensor, buffer);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
} // namespace common::nccl_util
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
@ -25,10 +25,13 @@
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace tensorrt_llm::common::nvtx
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::nvtx
|
||||
{
|
||||
inline nvtx3::color nextColor()
|
||||
{
|
||||
@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nvtx
|
||||
} // namespace common::nvtx
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
|
||||
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
|
||||
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)
|
||||
|
||||
@ -29,6 +29,7 @@
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
|
||||
@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
|
||||
});
|
||||
return creator();
|
||||
}
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/workspace.h"
|
||||
|
||||
@ -37,7 +38,9 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm::common::op
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::op
|
||||
{
|
||||
|
||||
// Write values into buffer
|
||||
@ -178,7 +181,7 @@ struct hash
|
||||
|
||||
// for testing only
|
||||
void const* getCommSessionHandle();
|
||||
} // namespace tensorrt_llm::common::op
|
||||
} // namespace common::op
|
||||
|
||||
inline bool isBuilding()
|
||||
{
|
||||
@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
|
||||
std::shared_ptr<cublasHandle_t> getCublasHandle();
|
||||
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#ifndef DEBUG
|
||||
|
||||
#define PLUGIN_CHECK(status) \
|
||||
|
||||
@ -16,14 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <float.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#else
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
@ -30,8 +31,8 @@
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#include "safetensors.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
@ -25,7 +26,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::common::safetensors
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::safetensors
|
||||
{
|
||||
using nvinfer1::DataType;
|
||||
|
||||
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
|
||||
{
|
||||
return std::make_shared<SafeTensor>(filename);
|
||||
}
|
||||
} // namespace tensorrt_llm::common::safetensors
|
||||
} // namespace common::safetensors
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cstdint>
|
||||
@ -23,7 +24,9 @@
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace tensorrt_llm::common::safetensors
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::safetensors
|
||||
{
|
||||
class INdArray
|
||||
{
|
||||
@ -58,4 +61,6 @@ public:
|
||||
virtual ~ISafeTensor() = default;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common::safetensors
|
||||
} // namespace common::safetensors
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
|
||||
namespace tensorrt_llm::common::stl_utils
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::stl_utils
|
||||
{
|
||||
|
||||
template <typename TInputIt, typename TOutputIt, typename TBinOp>
|
||||
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::stl_utils
|
||||
} // namespace common::stl_utils
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstdarg>
|
||||
@ -23,7 +24,9 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
|
||||
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
|
||||
return values;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,13 +14,16 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "tensorrt_llm/common/timestampUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::string getCurrentTimestamp()
|
||||
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,12 +14,17 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
|
||||
std::string getCurrentTimestamp();
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@ -26,7 +27,9 @@
|
||||
#endif
|
||||
#include <sstream>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
namespace
|
||||
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
|
||||
return mErrorCode;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,10 +14,13 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// CuBLAS >= 12.9.1 requires 256-byte alignment.
|
||||
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
|
||||
return total;
|
||||
}
|
||||
|
||||
}; // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -18,10 +18,11 @@
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "cutlass/device_kernel.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
|
||||
}
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -30,10 +30,11 @@
|
||||
#include "cutlass/epilogue/thread/linear_combination_relu.h"
|
||||
#include "cutlass/epilogue/thread/linear_combination_silu.h"
|
||||
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cutlass/epilogue/fusion/operations.hpp>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
|
||||
};
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -24,10 +24,11 @@
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
|
||||
}
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
|
||||
namespace
|
||||
{
|
||||
|
||||
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
[[nodiscard]] bool executorConfigIsValid(
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
{
|
||||
// Make sure logic in this function matches fixExecutorConfig
|
||||
if (executorConfig.getEnableChunkedContext())
|
||||
@ -65,8 +66,8 @@ namespace
|
||||
return true;
|
||||
}
|
||||
|
||||
[[nodiscard]] ExecutorConfig fixExecutorConfig(
|
||||
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
[[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
{
|
||||
// Make sure logic in this function matches executorConfigIsValid
|
||||
auto fixedExecutorConfig = executorConfig;
|
||||
@ -241,7 +242,7 @@ private:
|
||||
|
||||
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
|
||||
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
|
||||
ExecutorConfig const& executorConfig, bool isEncoder,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
|
||||
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
|
||||
{
|
||||
auto const gpusPerNode = jsonConfig.getGpusPerNode();
|
||||
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
|
||||
|
||||
Executor::Impl::Impl(std::filesystem::path const& modelPath,
|
||||
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
|
||||
|
||||
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
|
||||
|
||||
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
|
||||
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
|
||||
ModelType const modelType, ExecutorConfig const& executorConfig,
|
||||
ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
|
||||
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
|
||||
{
|
||||
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
|
||||
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
|
||||
}
|
||||
|
||||
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto const& worldConfig = model->getWorldConfig();
|
||||
auto const tp = worldConfig.getTensorParallelism();
|
||||
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
|
||||
shutdown();
|
||||
}
|
||||
|
||||
void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
|
||||
void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
|
||||
|
||||
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto const gptModelType = [&executorConfig, &modelConfig]()
|
||||
{
|
||||
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
|
||||
|
||||
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto fixedExecutorConfig = ExecutorConfig{};
|
||||
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
|
||||
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
|
||||
}
|
||||
|
||||
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
|
||||
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
|
||||
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
|
||||
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
|
||||
{
|
||||
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
|
||||
}
|
||||
|
||||
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
|
||||
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
|
||||
std::filesystem::path const& modelPath)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
@ -16,9 +16,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
|
||||
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
|
||||
cudaStream_t stream = 0);
|
||||
|
||||
}
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,12 +17,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
|
||||
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
|
||||
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
|
||||
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
|
||||
cudaStream_t const stream = 0);
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
#include "attentionMask.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
@ -24,8 +25,8 @@
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/gptKernels.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
@ -25,8 +26,8 @@
|
||||
|
||||
namespace tc = tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -64,4 +65,5 @@ template <typename MaskDataType>
|
||||
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,14 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/banBadWords.h"
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
using namespace tensorrt_llm::runtime;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
|
||||
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
|
||||
cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,14 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/banRepeatNgram.h"
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
using namespace tensorrt_llm::runtime;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16)
|
||||
|
||||
} // namespace kernels
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,13 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/kernels/decodingCommon.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf
|
||||
runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,13 +14,14 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/beamSearchKernels.h"
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -355,4 +356,5 @@ template void printLogProbs<float>(float const* x, int const nBS, int const nBMI
|
||||
template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/decodingCommon.h"
|
||||
#include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK
|
||||
@ -22,8 +23,8 @@
|
||||
|
||||
#define BEAM_SEARCH_DEBUG 0
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now
|
||||
@ -88,7 +89,7 @@ struct BeamHypotheses
|
||||
// Pointers related to beam search process, they are initialized in those two functions:
|
||||
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
|
||||
bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished
|
||||
FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
|
||||
::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
|
||||
|
||||
// Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs
|
||||
int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids
|
||||
@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
|
||||
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
|
||||
|
||||
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
|
||||
FinishedState const* finished, int const* endIds, float const* diversityRates,
|
||||
::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
|
||||
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
|
||||
|
||||
__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
|
||||
FinishedState const* finished, int const* endIds, float const* diversityRates,
|
||||
::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
|
||||
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
|
||||
|
||||
__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
|
||||
@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM
|
||||
#endif
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,13 +15,15 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
// Skip V1 kernels if beam_width > kMaxBeamWidthForV1
|
||||
INSTANTIATE_BEAM_SEARCH(float, 16, true);
|
||||
INSTANTIATE_BEAM_SEARCH(half, 16, true);
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
INSTANTIATE_BEAM_SEARCH(float, 4, false);
|
||||
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true);
|
||||
INSTANTIATE_BEAM_SEARCH(half, 4, false);
|
||||
INSTANTIATE_BEAM_SEARCH(half, 4, true);
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true);
|
||||
#endif // FAST_BUILD
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,9 +15,10 @@
|
||||
*/
|
||||
|
||||
#include "beamSearchKernelsTemplate.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
INSTANTIATE_BEAM_SEARCH(float, 8, false);
|
||||
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true);
|
||||
INSTANTIATE_BEAM_SEARCH(half, 8, false);
|
||||
INSTANTIATE_BEAM_SEARCH(half, 8, true);
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -18,11 +18,13 @@
|
||||
#error CUDART_VERSION Undefined!
|
||||
#elif (CUDART_VERSION >= 11050)
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
#else
|
||||
#include "3rdparty/cub/cub.cuh"
|
||||
#endif
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
@ -31,8 +33,8 @@
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -731,4 +733,5 @@ void beamSearchKernelLauncher(
|
||||
T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,12 +14,12 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "buildRelativeAttentionBiasKernel.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "buildRelativeAttentionBiasKernel.h"
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel
|
||||
#endif
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,10 +17,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat
|
||||
cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -19,12 +19,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cub/block/block_load.cuh>
|
||||
#include <cub/block/block_store.cuh>
|
||||
|
||||
#include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h"
|
||||
|
||||
namespace tensorrt_llm::kernels::causal_conv1d
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::causal_conv1d
|
||||
{
|
||||
|
||||
template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
|
||||
@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda<float, float>(ConvParamsBase& params, cu
|
||||
template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream);
|
||||
template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream);
|
||||
|
||||
} // namespace tensorrt_llm::kernels::causal_conv1d
|
||||
} // namespace kernels::causal_conv1d
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -20,11 +20,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
namespace tensorrt_llm::kernels::causal_conv1d
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::causal_conv1d
|
||||
{
|
||||
|
||||
#define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError())
|
||||
@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);
|
||||
template <typename input_t, typename weight_t>
|
||||
void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);
|
||||
|
||||
} // namespace tensorrt_llm::kernels::causal_conv1d
|
||||
} // namespace kernels::causal_conv1d
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -13,13 +13,16 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/kernels/quantization.cuh"
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion
|
||||
{
|
||||
template <int NRanks>
|
||||
struct SyncComm
|
||||
@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params)
|
||||
DISPATCH_RANKS(16);
|
||||
TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!");
|
||||
}
|
||||
}; // namespace tensorrt_llm::kernels::ar_fusion
|
||||
}; // namespace kernels::ar_fusion
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,16 +15,19 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/quantization.h"
|
||||
#include "tensorrt_llm/runtime/ipcUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion
|
||||
{
|
||||
template <typename DType>
|
||||
struct ElemsPerAccess;
|
||||
@ -139,4 +142,6 @@ struct AllReduceFusionParams
|
||||
};
|
||||
|
||||
void allreduce_fusion_op(AllReduceFusionParams const& params);
|
||||
} // namespace tensorrt_llm::kernels::ar_fusion
|
||||
} // namespace kernels::ar_fusion
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -13,9 +13,12 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion
|
||||
{
|
||||
|
||||
__global__ void lamport_initialize_kernel(float* ptr, int size)
|
||||
@ -94,4 +97,6 @@ void** Workspace::get_workspace()
|
||||
{
|
||||
return reinterpret_cast<void**>(m_workspace);
|
||||
}
|
||||
}; // namespace tensorrt_llm::kernels::ar_fusion
|
||||
}; // namespace kernels::ar_fusion
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,11 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/runtime/ipcUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion
|
||||
{
|
||||
|
||||
class Workspace
|
||||
@ -41,4 +44,6 @@ private:
|
||||
};
|
||||
|
||||
void lamport_initialize(void* ptr, int bytes, cudaStream_t stream);
|
||||
} // namespace tensorrt_llm::kernels::ar_fusion
|
||||
} // namespace kernels::ar_fusion
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
@ -25,7 +26,9 @@
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
using tensorrt_llm::common::divUp;
|
||||
@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce(
|
||||
sync_check_cuda_error(stream);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
|
||||
#include <NvInferRuntime.h>
|
||||
@ -24,7 +25,9 @@
|
||||
#include <cuda_fp16.h>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8;
|
||||
@ -119,4 +122,6 @@ void customLowPrecisionAllReduce(
|
||||
kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream);
|
||||
|
||||
int32_t max_workspace_size_lowprecision(int32_t tp_size);
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "mnnvlAllreduceKernels.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cooperative_groups.h>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
@ -31,7 +32,9 @@
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
|
||||
|
||||
namespace tensorrt_llm::kernels::mnnvl
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::mnnvl
|
||||
{
|
||||
|
||||
using tensorrt_llm::common::isNegZero;
|
||||
@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params)
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::kernels::mnnvl
|
||||
} // namespace kernels::mnnvl
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,11 +16,13 @@
|
||||
#ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
|
||||
#define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cstdint>
|
||||
|
||||
namespace tensorrt_llm::kernels::mnnvl
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::mnnvl
|
||||
{
|
||||
|
||||
/**
|
||||
@ -66,6 +68,7 @@ struct AllReduceFusionParams
|
||||
|
||||
void oneshotAllreduceFusionOp(AllReduceFusionParams const& params);
|
||||
void twoshotAllreduceFusionOp(AllReduceFusionParams const& params);
|
||||
} // namespace tensorrt_llm::kernels::mnnvl
|
||||
} // namespace kernels::mnnvl
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
|
||||
|
||||
@ -13,13 +13,16 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/common/reduceKernelUtils.cuh"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/kernels/quantization.cuh"
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion::moe
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion::moe
|
||||
{
|
||||
template <int NRanks>
|
||||
struct LamportComm
|
||||
@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par
|
||||
#undef MOE_FINALIZE_DISPATCH1
|
||||
}
|
||||
|
||||
}; // namespace tensorrt_llm::kernels::ar_fusion::moe
|
||||
}; // namespace kernels::ar_fusion::moe
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,16 +15,19 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/quantization.h"
|
||||
#include "tensorrt_llm/runtime/ipcUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels::ar_fusion::moe
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels::ar_fusion::moe
|
||||
{
|
||||
static constexpr int kElemsPerAccess = 8;
|
||||
static constexpr int kOneShotMaxToken = 128;
|
||||
@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams
|
||||
|
||||
void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params);
|
||||
|
||||
} // namespace tensorrt_llm::kernels::ar_fusion::moe
|
||||
} // namespace kernels::ar_fusion::moe
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user