TensorRT-LLMs/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp

/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "gemmAllReducePlugin.h"
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h"
#include "tensorrt_llm/plugins/common/pluginUtils.h"
namespace tc = tensorrt_llm::common;
namespace tensorrt_llm::plugins
{
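
// Persist profiled tactics for this GEMM id to a file cache under /tmp so later
// engine builds can reuse the results instead of re-profiling.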
void GemmAllReducePluginProfiler::serializeToOwnFile(GemmIdCore gemmId)
{
    std::vector<char> file_buf(getSerializationSize(gemmId));
    char* begin = file_buf.data();
    char* end = file_buf.data();
    serialize(end, gemmId);
    assert(end == begin + file_buf.size());
    auto fileName = getCacheFileName(gemmId);
    std::ofstream file(fileName, std::ios::binary);
    TLLM_CHECK(file.is_open());
    file.write(begin, file_buf.size());
    file.flush();
    file.close();
}
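
// Load previously profiled tactics for this GEMM id from the file cache written by
// serializeToOwnFile() and hand the buffer to deserialize() along with the problem shape.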
void GemmAllReducePluginProfiler::deserializeFromOwnFile(GemmIdCore gemmId, GemmDims problemShape)
{
    auto fileName = getCacheFileName(gemmId);
    std::ifstream file(fileName, std::ios::binary);
    TLLM_CHECK(file.is_open());
    file.seekg(0, std::ios::end);
    std::streamsize size = file.tellg();
    TLLM_CHECK(size > 0);
    file.seekg(0, std::ios::beg);
    std::vector<char> file_buf(size);
    file.read(file_buf.data(), size);
    file.close();
    char const* begin = const_cast<char const*>(file_buf.data());
    char const* end = begin;
    deserialize(end, problemShape, gemmId);
    assert(end == begin + size);
}
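
// Cache files are keyed by the GEMM N and K dimensions and the data type;
// M is not part of the key because it varies at runtime.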
std::string GemmAllReducePluginProfiler::getCacheFileName(GemmIdCore gemmId)
{
    std::stringstream fileName;
    fileName << "/tmp/gemm-AR";
    fileName << "-n" << std::to_string(gemmId.n);
    fileName << "-k" << std::to_string(gemmId.k);
    fileName << "-" << tc::getDtypeString(gemmId.dtype);
    fileName << ".prof_cache";
    return fileName.str();
}
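
// Benchmark a single launch configuration. The profiling workspace is carved into
// contiguous regions for A, B, D and the A/B scale tensors (layout must match
// computeTmpSize), and the GEMM+AllReduce runs on a single GPU (tpGroup = {0})
// with no multicast output buffer.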
void GemmAllReducePluginProfiler::runTactic(int m, int n, int k,
    cutlass_kernels::GemmAllReduceImplInterface::LaunchConfig const& tactic, char* workspace,
    cudaStream_t const& stream)
{
    const size_t dtype_size = tc::getDTypeSize(mType);
    char* inputA = workspace;
    char* inputB = inputA + m * k * dtype_size;
    char* outputD = inputB + n * k * dtype_size;
    char* inputSFA = outputD + m * n * dtype_size;
    char* inputSFB = inputSFA + m * k * dtype_size;

    std::set<int> tpGroup = {0};

    // Run on a single GPU
    cutlass_kernels::GemmAllReduceImplInterface::ProblemArgs args;
    args.argProblemShape(m, n, k, 1)
        .argA((void*) inputA)
        .argB((void*) inputB)
        .argD((void*) outputD, /*output_mc=*/nullptr)
        .argAScale((void*) inputSFA)
        .argBScale((void*) inputSFB)
        .argRanks(0, tpGroup)
        .argAlpha(1.f)
        .argBeta(0.f) // no bias
        .argLaunchConfig(tactic);

    TLLM_CHECK(mRunner != nullptr);
    mRunner->run(args, stream);
}
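
// Size the profiling workspace for the largest M that will be tuned. The layout must
// match the pointer arithmetic in runTactic: A, B, D, then the A and B scale tensors.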
void GemmAllReducePluginProfiler::computeTmpSize(size_t maxM, size_t n, size_t k)
{
    TLLM_CHECK(maxM != 0);
    TLLM_CHECK(n != 0);
    TLLM_CHECK(k != 0);
    // mType refers to the output data type.
    // WARNING: this assumes the output precision is greater than or equal to the input precision.
    const size_t dtype_size = tc::getDTypeSize(mType);
    size_t bytes = 0;
    bytes += maxM * k * dtype_size; // A
    bytes += n * k * dtype_size;    // B
    // No C
    // D is typically an IPC buffer; however, when tuning the GEMM it must run on a single GPU.
    bytes += maxM * n * dtype_size; // D
    // Scale tensors for A and B - at most the same size as A/B.
    bytes += maxM * k * dtype_size; // A scales
    bytes += n * k * dtype_size;    // B scales
    setTmpWorkspaceSizeInBytes(bytes);
}
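
// The candidate tactics do not depend on the problem size: every launch configuration
// supported by the runner is profiled.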
std::vector<cutlass_kernels::GemmAllReduceImplInterface::LaunchConfig> GemmAllReducePluginProfiler::getTactics(
    int m, int n, int k) const
{
    TLLM_CHECK(mRunner != nullptr);
    return mRunner->getSupportedLaunchConfigs();
}

} // namespace tensorrt_llm::plugins