[TRTLLM-4629] [feat] Step1: trtllm-gen kernels support sm103
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Parent: 25389c9fe2
Commit: cca347e6b4
@ -111,6 +111,8 @@ constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_89 = 89;
constexpr int32_t kSM_90 = 90;
constexpr int32_t kSM_100 = 100;
constexpr int32_t kSM_100f = 10100;
constexpr int32_t kSM_103 = 103;
constexpr int32_t kSM_120 = 120;
constexpr int32_t kSM_121 = 121;

@ -218,8 +218,8 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
    gemmData.mInputBuffers.mPtrPerTokenSfA = mOptions.transposeMmaOutput ? perTokensSfB : perTokensSfA;
    gemmData.mInputBuffers.mPtrPerTokenSfB = mOptions.transposeMmaOutput ? perTokensSfA : perTokensSfB;
    gemmData.mInputBuffers.mPtrBias = ptrBias;
    gemmData.mInputBuffers.mPtrSwiGluAlpha = ptrAlpha;
    gemmData.mInputBuffers.mPtrSwiGluBeta = ptrBeta;
    gemmData.mInputBuffers.mPtrGatedActAlpha = ptrAlpha;
    gemmData.mInputBuffers.mPtrGatedActBeta = ptrBeta;
    gemmData.mInputBuffers.mPtrClampLimit = ptrClampLimit;

    gemmData.mInputBuffers.mPtrRouteMap = routeMap;

@ -247,22 +247,47 @@ struct BatchedGemmData
    // The clamp limit for the accumulator before applying the activation.
    // Shape is [B].
    // Clamp is INF if nullptr.
    // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb.
    // If applied on SwiGlu, it will be:
    //
    //   x_glu    = x_glu.clamp(min=None, max=limit)
    //   x_linear = x_linear.clamp(min=-limit, max=limit)
    //
    // The given clamp limit applies to the dequantized values, so the order of operations would
    // look something like this:
    //
    //   x0  = x0 * dqAb
    //   x0  = clamp(x0, none, limit)
    //   x0  = x0 * sigmoid(alpha * x0)
    //   x1  = dqAb * x1
    //   x1  = clamp(x1, -limit, limit)
    //   out = qC * (x1 + beta) * x0
    //
    // Given that dqAb and qC are combined into scaleC, we can fold dqAb into the clamp
    // limit and apply the clamping prior to dequantization:
    //
    //   x0     = clamp(x0, none, limit / dqAb)
    //   x0     = x0 * dqAb
    //   x0     = x0 * sigmoid(alpha * x0)
    //   x1     = clamp(x1, -limit / dqAb, limit / dqAb)
    //   scaleC = dqAb * qC
    //   beta'  = beta / dqAb
    //   out    = scaleC * (x1 + beta') * x0
    //
    // Note this assumes that scaleAb == scaleGate, which is true in the TRT-LLM MoE use case.
    //
    float const* mPtrClampLimit{nullptr};

    // The alpha and beta for SwiGlu.
    // The alpha and beta for SwiGlu or GeGlu.
    // gatedActivation <- (x0 + beta) * activation(x1, alpha)
    // Shape is [B].
    // Alpha is 1.f if nullptr.
    // Beta is 0.f if nullptr.
    // The formula:
    // The formula for SwiGlu (for GeGlu, replace sigmoid with phi):
    //
    //   out_glu = x_glu * torch.sigmoid(alpha * x_glu) + (x_linear + beta)
    float const* mPtrSwiGluAlpha{nullptr};
    float const* mPtrSwiGluBeta{nullptr};
    //   out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
    float const* mPtrGatedActAlpha{nullptr};
    float const* mPtrGatedActBeta{nullptr};

    // Param is used when the kernel is configured with -routeAct true.
    // The inputs are not padded, but the outputs are padded to divUpMul(M[bi], tileM) for batchM or

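For reference, a minimal host-side sketch of the clamped SwiGlu with the dequant scale folded into the clamp limit, following the comment above. This is an illustration only, not the device epilogue; the function name and the plain-float signature are hypothetical.

    #include <algorithm>
    #include <cmath>

    // Hypothetical reference of the clamped SwiGlu epilogue with dqAb folded into the limit.
    float clampedSwiGluRef(float x0, float x1, float dqAb, float qC, float alpha, float beta, float limit)
    {
        float const limitQ = limit / dqAb;                // limit' = limit / dqAb
        x0 = std::min(x0, limitQ);                        // x0 = clamp(x0, none, limit / dqAb)
        x1 = std::max(-limitQ, std::min(x1, limitQ));     // x1 = clamp(x1, -limit / dqAb, limit / dqAb)
        x0 *= dqAb;                                       // dequantize the gate
        x0 = x0 / (1.0f + std::exp(-alpha * x0));         // x0 = x0 * sigmoid(alpha * x0)
        float const scaleC = dqAb * qC;                   // scaleC = dqAb * qC
        float const betaPrime = beta / dqAb;              // beta' = beta / dqAb
        return scaleC * (x1 + betaPrime) * x0;            // out = scaleC * (x1 + beta') * x0
    }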
@ -432,10 +457,48 @@ public:
    // Returns the number of available cubin configurations
    size_t getNumBatchedGemmConfigs() const;

    // Returns the number of CTAs of the last launched kernel.
    int32_t getNumCtas() const
    // Returns the grid dimensions of the current kernel.
    std::tuple<int32_t, int32_t, int32_t> getGridDim(
        BatchedGemmOptions const& options, std::optional<int32_t> maxNumCtasInBatchDim = std::nullopt) const
    {
        return mNumCtas;
        bool const batchM = options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM;

        int32_t numCtasBatch{0};
        // For normal BMM, mNumTokens == 0 and the number of CTAs is known to the host.
        if (options.mIsStaticBatch)
        {
            for (int32_t bi = 0; bi < options.mNumBatches; ++bi)
            {
                numCtasBatch += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM)
                                       : gemm::divUp(options.mBatchedN[bi], options.mTileN);
            }
        }
        // For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime.
        // We launch the maximum possible number of CTAs and use ptrNumNonExitingCtas to determine the
        // actual number of CTAs to run.
        else if ((options.mEnablesEarlyExit || options.mEnablesDelayedEarlyExit) && options.mNumTokens != 0)
        {
            assert(maxNumCtasInBatchDim.has_value()
                && "maxNumCtasInBatchDim must be provided when options.mNumTokens != 0");
            numCtasBatch = maxNumCtasInBatchDim.value();
        }
        else
        {
            throw std::invalid_argument("Invalid combination of options");
        }

        int32_t const numCtasTile
            = batchM ? gemm::divUp(options.mN, options.mTileN) : gemm::divUp(options.mM, options.mTileM);
        int32_t const numCtasInner = options.mNumSlicesForSplitK;
        return std::make_tuple(numCtasBatch, numCtasTile, numCtasInner);
    }

    // Returns the number of CTAs of the current kernel.
    int32_t getNumCtas(
        BatchedGemmOptions const& options, std::optional<int32_t> maxNumCtasInBatchDim = std::nullopt) const
    {
        auto [numCtasBatch, numCtasTile, numCtasInner] = getGridDim(options, maxNumCtasInBatchDim);
        return numCtasBatch * numCtasTile * numCtasInner;
    }

    // Returns true if the configuration of the cubin can be executed for the given params.

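A brief caller-side sketch of how the two helpers above compose (hedged: `runner`, `options`, and `maxNumCtasInBatchDim` are hypothetical names at the call site):

    // Hypothetical caller: derive the launch grid from the options instead of
    // reading back the CTA count of the last launched kernel.
    auto const [numCtasBatch, numCtasTile, numCtasInner] = runner.getGridDim(options, maxNumCtasInBatchDim);
    int32_t const totalNumCtas = runner.getNumCtas(options, maxNumCtasInBatchDim);
    // totalNumCtas == numCtasBatch * numCtasTile * numCtasInner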
@ -453,10 +516,6 @@ private:

    // Returns the size padded to the alignment
    size_t getSizePaddedToAlignment(size_t size, size_t alignment) const;

private:
    // Number of the CTAs of the last launched kernel.
    int32_t mNumCtas{0};
};

////////////////////////////////////////////////////////////////////////////////////////////////////

@ -518,7 +577,7 @@ bool BatchedGemmInterface::isValidConfig(BatchedGemmConfig const& config, Batche
    auto options = getOptionsFromConfigAndData(config, data);

    // Is Blackwell?
    bool isBlackwell = config.mSm == gemm::SmVersion::Sm100a;
    bool isBlackwell = gemm::isSmVersionBlackwell(config.mSm);

    // Check options without modifications.
    return checkAndUpdateBatchedGemmOptions(options, isBlackwell,

@ -629,46 +688,23 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa
        }
    }

    int32_t numCtaXy{0};
    if (options.mIsStaticBatch)
    {
        for (int32_t bi = 0; bi < options.mNumBatches; ++bi)
        {
            numCtaXy += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM)
                               : gemm::divUp(options.mBatchedN[bi], options.mTileN);
        }
    }

    int32_t maxNumCtasInBatchDim{numCtaXy};
    // For normal BMM, mNumTokens == 0 and the number of CTAs is known to the host.
    // For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime.
    // We launch the maximum possible number of CTAs and use ptrNumNonExitingCtas to determine
    // the actual number of CTAs to run.
    if ((options.mEnablesEarlyExit || options.mEnablesDelayedEarlyExit) && options.mNumTokens != 0)
    {
        // Get the maximum number of CTAs in the batch dim.
        maxNumCtasInBatchDim = batchedGemmData.mProblemDimensions.mMaxNumCtasInTokenDim;
    }

    auto const numCtaX = batchM ? maxNumCtasInBatchDim : gemm::divUp(options.mM, options.mTileM);
    auto const numCtaY = batchM ? gemm::divUp(options.mN, options.mTileN) : maxNumCtasInBatchDim;
    auto const numCtaZ = options.mNumSlicesForSplitK;
    mNumCtas = numCtaX * numCtaY * numCtaZ;

    auto [numCtaBatch, numCtaTile, numCtaInner]
        = getGridDim(options, batchedGemmData.mProblemDimensions.mMaxNumCtasInTokenDim);
    auto kernelParams = KernelParamsSetup::setKernelParams(options, batchM, batchedGemmData.mInputBuffers.mPtrA,
        batchedGemmData.mInputBuffers.mPtrB, batchedGemmData.mOutputBuffers.mPtrC,
        batchedGemmData.mInputBuffers.mPtrSfA, batchedGemmData.mInputBuffers.mPtrSfB,
        batchedGemmData.mInputBuffers.mPtrPerTokenSfA, batchedGemmData.mInputBuffers.mPtrPerTokenSfB,
        batchedGemmData.mInputBuffers.mPtrBias, batchedGemmData.mOutputBuffers.mPtrSfC,
        batchedGemmData.mInputBuffers.mPtrScaleC, batchedGemmData.mInputBuffers.mPtrScaleGate,
        batchedGemmData.mInputBuffers.mPtrClampLimit, batchedGemmData.mInputBuffers.mPtrSwiGluAlpha,
        batchedGemmData.mInputBuffers.mPtrSwiGluBeta, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax,
        batchedGemmData.mInputBuffers.mPtrClampLimit, batchedGemmData.mInputBuffers.mPtrGatedActAlpha,
        batchedGemmData.mInputBuffers.mPtrGatedActBeta, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax,
        dPtrRowMaxBars, batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas,
        batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx,
        batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, maxNumCtasInBatchDim);
        batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, numCtaBatch);

    // The size of the grid.
    std::vector<int32_t> grid{numCtaX, numCtaY, numCtaZ};
    std::vector<int32_t> grid = batchM ? std::vector<int32_t>{numCtaBatch, numCtaTile, numCtaInner}
                                       : std::vector<int32_t>{numCtaTile, numCtaBatch, numCtaInner};

#ifdef TLLM_GEN_EXPORT_INTERFACE
    CUmodule cuModule;

@ -20,6 +20,7 @@
#include "GemmGatedActOptions.h"
#include "GemmOptions.h"

#include <cstdint>
#include <vector>

#ifndef TLLM_GEN_EXPORT_INTERFACE
@ -32,17 +33,19 @@
    if (!(cond)) \
    { \
        printArgs(__VA_ARGS__); \
        printArgs("\n"); \
        return false; \
    }

#define TLLM_LOG_ERROR(...) TLLM_CHECK_ERROR(false, __VA_ARGS__)

#define TLLM_CHECK_ERROR_FMT(...) TLLM_CHECK_ERROR(false, __VA_ARGS__)
#define TLLM_CHECK_ERROR_FMT(cond, ...) TLLM_CHECK_ERROR(cond, __VA_ARGS__)

#define TLLM_CHECK_WARNING(cond, ...) \
    if (!(cond)) \
    { \
        printArgs(__VA_ARGS__); \
        printArgs("\n"); \
        return false; \
    }

@ -50,7 +53,7 @@

#define TLLM_LOG_INFO(...) TLLM_CHECK_WARNING(false, __VA_ARGS__)

#endif
#endif // TLLM_GEN_EXPORT_INTERFACE

namespace batchedGemm
{

@ -95,11 +98,12 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
|
||||
bool useShuffledMatrixA, bool sliceK, gemm::SplitK splitK, bool transposeMmaOutput, int tileM, int tileN,
|
||||
int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule,
|
||||
bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps,
|
||||
bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC,
|
||||
int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType, bool clampBeforeAct,
|
||||
std::vector<int> batchedM, std::vector<int> batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch,
|
||||
int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting, bool fusedAct,
|
||||
int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps)
|
||||
bool useTwoMmaWarps, std::optional<int32_t> sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB,
|
||||
tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler,
|
||||
gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector<int> batchedM, std::vector<int> batchedN,
|
||||
BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl,
|
||||
bool gridWaitForPrimaryRouting, bool fusedAct, int numRegsPerThreadNonEpilogueWarp,
|
||||
int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt)
|
||||
: gemmGatedAct::GemmGatedActOptions(
|
||||
gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA,
|
||||
dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs,
|
||||
@ -110,21 +114,22 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
|
||||
numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp,
|
||||
useShuffledMatrixA, sliceK, splitK, transposeMmaOutput, tileM, tileN, tileK, useUnrollLoop2xForMma,
|
||||
useCustomMmaSchedule, useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8, usePerTokenSfA,
|
||||
usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfLayoutA, sfLayoutB, sfLayoutC,
|
||||
sfReshapeFactor, tileScheduler),
|
||||
usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfBlockSizeA, sfLayoutA, sfLayoutB,
|
||||
sfLayoutC, sfReshapeFactor, tileScheduler),
|
||||
actType, clampBeforeAct)
|
||||
, mBatchedM(batchedM)
|
||||
, mBatchedN(batchedN)
|
||||
, mBatchMode(BatchMode(batchMode))
|
||||
, mNumBatches(numBatches)
|
||||
, mIsStaticBatch(isStaticBatch)
|
||||
, mNumTokens(numTokens)
|
||||
, mRouteImpl(routeImpl)
|
||||
, mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting)
|
||||
, mFusedAct(fusedAct)
|
||||
, mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting)
|
||||
, mIsStaticBatch(isStaticBatch)
|
||||
, mNumBatches(numBatches)
|
||||
, mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp)
|
||||
, mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp)
|
||||
, mNumRegsCastAWarps(numRegsCastAWarps)
|
||||
, mNumTokens(numTokens)
|
||||
, mRouteImpl(routeImpl)
|
||||
, mUseTmaOobOpt(useTmaOobOpt)
|
||||
{
|
||||
}
|
||||
|
||||
@ -134,28 +139,28 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
|
||||
std::vector<int> mBatchedN;
|
||||
// Whether batching M or N.
|
||||
BatchMode mBatchMode{BatchMode::BatchM};
|
||||
// Number of Gemm batches.
|
||||
int mNumBatches;
|
||||
|
||||
// Whether the batch size is static (i.e. known at kernel launch time).
|
||||
bool mIsStaticBatch{true};
|
||||
// Total number of tokens.
|
||||
int mNumTokens{32};
|
||||
// Whether to load the input tokens and do routing.
|
||||
RouteImpl mRouteImpl{RouteImpl::NoRoute};
|
||||
// Whether to perform a fused gated activation.
|
||||
bool mFusedAct{false};
|
||||
// Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens,
|
||||
// ptrCtaIdxXyToBatchIdx, etc. should wait on a grid dependency.
|
||||
bool mGridWaitForPrimaryRouting{true};
|
||||
|
||||
// Whether to perform a fused gated activation.
|
||||
bool mFusedAct{false};
|
||||
|
||||
// Whether the batch size is static (i.e. known at kernel launch time).
|
||||
bool mIsStaticBatch{true};
|
||||
// Number of Gemm batches.
|
||||
int mNumBatches;
|
||||
// Number of registers per thread for non-epilogue warps
|
||||
int mNumRegsPerThreadNonEpilogueWarp{0};
|
||||
// Number of registers per thread for epilogue warps
|
||||
int mNumRegsPerThreadEpilogueWarp{0};
|
||||
// Number of registers for the cast A warps.
|
||||
int mNumRegsCastAWarps{0};
|
||||
// Total number of tokens.
|
||||
int mNumTokens{32};
|
||||
// Whether to load the input tokens and do routing.
|
||||
RouteImpl mRouteImpl{RouteImpl::NoRoute};
|
||||
// Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in
|
||||
// BatchedGemm/KernelParamsDecl.h.
|
||||
bool mUseTmaOobOpt{false};
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -165,6 +170,20 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
|
||||
{
|
||||
|
||||
bool isValid = true;
|
||||
if (options.mUseTmaOobOpt && !options.mUseTwoTmaLoadWarps)
|
||||
{
|
||||
if (updateOptions)
|
||||
{
|
||||
// Since any routing (mRouteAct != NoRoute) requires mUseTwoTmaLoadWarps == true,
// a single TMA load warp is not the target use case for the OOB optimization.
|
||||
options.mUseTmaOobOpt = false;
|
||||
}
|
||||
else if (!options.mUseTwoTmaLoadWarps)
|
||||
{
|
||||
TLLM_CHECK_ERROR(false, "TMA OOB optimization requires two TMA load warps.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (options.mFusedAct)
|
||||
{
|
||||
// ensure that we check the fused options as well
|
||||
@ -198,22 +217,19 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
|
||||
}
|
||||
}
|
||||
|
||||
for (int b = 0; b < options.mNumBatches; b++)
|
||||
if (batchM)
|
||||
{
|
||||
if (batchM)
|
||||
{
|
||||
TLLM_CHECK_ERROR(options.mN > 0 && options.mK > 0, "N and K must be larger than 0");
|
||||
TLLM_CHECK_ERROR(options.mN >= options.mTileN, "N must be equal or larger than TileN.");
|
||||
TLLM_CHECK_ERROR(options.mN % options.mTileN == 0, "N must be divisible by TileN.");
|
||||
TLLM_CHECK_ERROR(!options.mTransposeMmaOutput, "When batchM the MMA output has to be in row-major.");
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_CHECK_ERROR(options.mM > 0 && options.mK > 0, "M and K must be larger than 0");
|
||||
TLLM_CHECK_ERROR(options.mM >= options.mTileM, "N must be equal or larger than tileN.");
|
||||
TLLM_CHECK_ERROR(options.mM % options.mTileM == 0, "M must be divisible by TileM.");
|
||||
TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "When batchN the MMA output has to be in column-major.");
|
||||
}
|
||||
TLLM_CHECK_ERROR(options.mN > 0 && options.mK > 0, "N and K must be larger than 0");
|
||||
TLLM_CHECK_ERROR(options.mN >= options.mTileN, "N must be equal or larger than TileN.");
|
||||
TLLM_CHECK_ERROR(options.mN % options.mTileN == 0, "N must be divisible by TileN.");
|
||||
TLLM_CHECK_ERROR(!options.mTransposeMmaOutput, "When batchM the MMA output has to be in row-major.");
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_CHECK_ERROR(options.mM > 0 && options.mK > 0, "M and K must be larger than 0");
|
||||
TLLM_CHECK_ERROR(options.mM >= options.mTileM, "M must be equal or larger than TileM.");
|
||||
TLLM_CHECK_ERROR(options.mM % options.mTileM == 0, "M must be divisible by TileM.");
|
||||
TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "When batchN the MMA output has to be in column-major.");
|
||||
}
|
||||
|
||||
if (options.mUseDeepSeekFp8)
|
||||
@ -367,7 +383,8 @@ inline std::string dumpOptions(BatchedGemmOptions const& options)
|
||||
ss << "mFusedAct=" << options.mFusedAct << "," << std::endl;
|
||||
ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl;
|
||||
ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl;
|
||||
ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl;
|
||||
ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl;
|
||||
ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -67,7 +67,12 @@ enum class ActType
|
||||
// beta' = beta / scaleAb, scaleC' = scaleC * scaleAb.
|
||||
//
|
||||
// GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0.
|
||||
SwiGlu
|
||||
SwiGlu,
|
||||
// For ActType == GeGlu, we use the simplified version
|
||||
// gatedAct = scaleC' * (x0 + beta') * ((x1 * scaleGate) * phi(alpha * x1 * scaleGate)),
|
||||
// where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales,
|
||||
// beta' = beta / scaleAb, scaleC' = scaleC * scaleAb.
|
||||
GeGlu,
|
||||
};
|
||||
|
||||
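For illustration, a minimal host-side reference of the simplified GeGlu formula above (the function is hypothetical, and phi is assumed here to be the standard Gaussian CDF as in GELU; requires <cmath>):

    // Hypothetical reference of the simplified GeGlu described above.
    float geGluRef(float x0, float x1, float scaleAb, float scaleC, float scaleGate, float alpha, float beta)
    {
        auto phi = [](float x) { return 0.5f * (1.0f + std::erf(x / std::sqrt(2.0f))); }; // Gaussian CDF (assumption)
        float const betaPrime = beta / scaleAb;      // beta'   = beta / scaleAb
        float const scaleCPrime = scaleC * scaleAb;  // scaleC' = scaleC * scaleAb
        float const gate = x1 * scaleGate;
        return scaleCPrime * (x0 + betaPrime) * (gate * phi(alpha * gate));
    }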
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -81,6 +86,7 @@ enum class ActType
|
||||
}
|
||||
|
||||
TLLM_ACT_TYPE_FUNCTION(SwiGlu)
|
||||
TLLM_ACT_TYPE_FUNCTION(GeGlu)
|
||||
|
||||
#undef TLLM_ACT_TYPE_FUNCTION
|
||||
|
||||
@ -91,6 +97,7 @@ inline std::string getActTypeName(ActType type)
|
||||
switch (type)
|
||||
{
|
||||
case ActType::SwiGlu: return "SwiGlu";
|
||||
case ActType::GeGlu: return "GeGlu";
|
||||
default: return "Unknown type";
|
||||
}
|
||||
}
|
||||
@ -179,7 +186,7 @@ inline std::string dumpOptions(GemmGatedActOptions const& options)
|
||||
ss << gemm::dumpOptions(options) << ", ";
|
||||
ss << "mActType="
|
||||
<< "gemmGatedAct::ActType(" << static_cast<int32_t>(options.mActType) << ")," << std::endl;
|
||||
ss << "mClampLimit=" << options.mClampBeforeAct << "," << std::endl;
|
||||
ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
|
||||
@ -31,23 +32,30 @@
|
||||
#else
|
||||
#include <iostream>
|
||||
|
||||
template <typename T>
|
||||
void printArgs(T arg)
|
||||
{
|
||||
#ifdef TLLM_GEN_DEBUG
|
||||
std::cout << arg;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
void printArgs(T first, Args... args)
|
||||
{
|
||||
#ifdef TLLM_GEN_DEBUG
|
||||
std::cout << first;
|
||||
printArgs(first);
|
||||
if constexpr (sizeof...(args) > 0)
|
||||
{
|
||||
std::cout << " ";
|
||||
printArgs(", ");
|
||||
printArgs(args...);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define TLLM_CHECK_ERROR(cond, ...) \
|
||||
if (!(cond)) \
|
||||
{ \
|
||||
printArgs(__VA_ARGS__); \
|
||||
printArgs("\n"); \
|
||||
return false; \
|
||||
}
|
||||
|
||||
@ -59,6 +67,7 @@ void printArgs(T first, Args... args)
|
||||
if (!(cond)) \
|
||||
{ \
|
||||
printArgs(__VA_ARGS__); \
|
||||
printArgs("\n"); \
|
||||
return false; \
|
||||
}
|
||||
|
||||
@ -66,7 +75,7 @@ void printArgs(T first, Args... args)
|
||||
|
||||
#define TLLM_LOG_INFO(...) TLLM_CHECK_WARNING(false, __VA_ARGS__)
|
||||
|
||||
#endif
|
||||
#endif // TLLM_GEN_EXPORT_INTERFACE
|
||||
|
||||
namespace batchedGemm
|
||||
{
|
||||
@ -103,8 +112,9 @@ struct GemmOptions
|
||||
bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM,
|
||||
int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule,
|
||||
bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB,
|
||||
bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB,
|
||||
tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler)
|
||||
bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, std::optional<int32_t> sfBlockSizeA,
|
||||
tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int sfReshapeFactor,
|
||||
TileScheduler tileScheduler)
|
||||
: mAllReduceAlgo{allReduceAlgo}
|
||||
, mBiasType{biasType}
|
||||
, mBlockK(blockK)
|
||||
@ -167,6 +177,7 @@ struct GemmOptions
|
||||
, mUseTmaStore{useTmaStore}
|
||||
, mUseTwoTmaLoadWarps{useTwoTmaLoadWarps}
|
||||
, mUseTwoMmaWarps{useTwoMmaWarps}
|
||||
, mSfBlockSizeA{sfBlockSizeA}
|
||||
, mSfLayoutA{sfLayoutA}
|
||||
, mSfLayoutB{sfLayoutB}
|
||||
, mSfLayoutC{sfLayoutC}
|
||||
@ -313,6 +324,8 @@ struct GemmOptions
|
||||
bool mUseTwoTmaLoadWarps{false};
|
||||
// Use two different warps for MMA tasks. Applicable only to DeepSeek FP8.
|
||||
bool mUseTwoMmaWarps{false};
|
||||
// Block size of A. For dtypeA == E2m1 and dtypeB == E4m3.
|
||||
std::optional<int32_t> mSfBlockSizeA{std::nullopt};
|
||||
// Scale factors layout for A.
|
||||
tg::SfLayout mSfLayoutA{tg::SfLayout::R128c4};
|
||||
// Scale factors layout for B.
|
||||
@ -334,9 +347,18 @@
enum class SmVersion
{
    Sm90a,
    Sm100a
    Sm100a,
    Sm100f,
    Sm103a
};

////////////////////////////////////////////////////////////////////////////////////////////////////

bool isSmVersionBlackwell(SmVersion smVersion)
{
    return smVersion == SmVersion::Sm100a || smVersion == SmVersion::Sm100f || smVersion == SmVersion::Sm103a;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// GemmConfig

@ -478,6 +500,16 @@ inline std::string dumpOptions(GemmOptions const& options)
|
||||
ss << "mUseTmaStore=" << options.mUseTmaStore << "," << std::endl;
|
||||
ss << "mUseTwoTmaLoadWarps=" << options.mUseTwoTmaLoadWarps << "," << std::endl;
|
||||
ss << "mUseTwoMmaWarps=" << options.mUseTwoMmaWarps << "," << std::endl;
|
||||
if (options.mSfBlockSizeA.has_value())
|
||||
{
|
||||
ss << "mSfBlockSizeA=" << options.mSfBlockSizeA.value() << "," << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
ss << "mSfBlockSizeA="
|
||||
<< "std::nullopt"
|
||||
<< ", " << std::endl;
|
||||
}
|
||||
ss << "mSfLayoutA="
|
||||
<< "trtllm::gen::SfLayout(" << static_cast<int32_t>(options.mSfLayoutA) << ")"
|
||||
<< "," << std::endl;
|
||||
@ -527,6 +559,7 @@ inline int32_t getShuffleBlockSize(int epilogueTileM)
|
||||
inline bool checkAndUpdateGemmOptions(
|
||||
GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true)
|
||||
{
|
||||
|
||||
if (options.mDtypeB == tg::Dtype::Void)
|
||||
{
|
||||
if (updateOptions)
|
||||
@ -567,7 +600,8 @@ inline bool checkAndUpdateGemmOptions(
|
||||
// Currently, we only support {MxFp4, NvFp4} -> Bf16.
|
||||
TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA)
|
||||
|| ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1)
|
||||
&& options.mDtypeMmaA == tg::Dtype::Bfloat16),
|
||||
&& options.mDtypeMmaA == tg::Dtype::Bfloat16)
|
||||
|| (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3),
|
||||
"Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA));
|
||||
|
||||
// Check that the B cast is supported.
|
||||
@ -716,7 +750,20 @@ inline bool checkAndUpdateGemmOptions(
|
||||
{
|
||||
TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell");
|
||||
|
||||
int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32;
|
||||
int mmaK = 32;
|
||||
if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4)
|
||||
{
|
||||
if (options.mMmaK == 96)
|
||||
{
|
||||
mmaK = 96;
|
||||
TLLM_CHECK_ERROR(options.mTileK == 768, "When mmaK == 96, only tileK == 768 is supported");
|
||||
TLLM_CHECK_ERROR(options.mTileN <= 128, "When mmaK == 96, only tileN <= 128 is supported");
|
||||
}
|
||||
else
|
||||
{
|
||||
mmaK = 64;
|
||||
}
|
||||
}
|
||||
if (options.mMmaK != mmaK)
|
||||
{
|
||||
int newTileK = mmaK * divUp(options.mTileK, mmaK);
|
||||
@ -737,9 +784,27 @@ inline bool checkAndUpdateGemmOptions(
|
||||
TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN,
|
||||
") must be >= 64 or equal to TileN (", options.mTileN, ")");
|
||||
}
|
||||
|
||||
if (options.mSfBlockSizeA.has_value())
|
||||
{
|
||||
// Only E2m1 x E4m3 is tested. MxE2m1 x bf16 may also work.
|
||||
TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeB == tg::Dtype::E4m3,
|
||||
"sfBlockSizeA is only supported for E2m1 and E4m3 types. Found dtypeA=", tg::dtypeToString(options.mDtypeA),
|
||||
" dtypeB=", tg::dtypeToString(options.mDtypeB));
|
||||
|
||||
// sfBlockSizeA must be 16 or 32.
|
||||
// SfBlockSizeA can also support 64 and 128, although these are not officially supported NVIDIA
// formats. Note that the type conversion needs to happen before the TCs.
|
||||
// For example, convert e2m1 to e4m3 inside TmemCastA.
|
||||
// If we want to support sfBlockSizeA=8, we can write another version of convertE2m1ToSfE4m3,
|
||||
// which only packs 8 e2m1 elements.
|
||||
TLLM_CHECK_ERROR(options.mSfBlockSizeA.value() == 16 || options.mSfBlockSizeA.value() == 32, "SfBlockSizeA (",
|
||||
options.mSfBlockSizeA.value(), ") must be 16 or 32.");
|
||||
}
|
||||
|
||||
if (tg::dtypeIsBlockFmt(options.mDtypeA))
|
||||
{
|
||||
int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA);
|
||||
int numEltsPerSfA = options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA));
|
||||
TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK,
|
||||
") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA));
|
||||
auto const numEltsPerSfAInK = options.mK / numEltsPerSfA;
|
||||
@ -1293,8 +1358,8 @@ inline bool checkAndUpdateGemmOptions(
|
||||
{
|
||||
// Init kernel traits.
|
||||
options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc,
|
||||
options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK,
|
||||
options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma,
|
||||
options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mMmaK, options.mTileM, options.mTileN,
|
||||
options.mTileK, options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma,
|
||||
options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore,
|
||||
options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent,
|
||||
options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType);
|
||||
|
||||
File diff suppressed because it is too large
@ -18,6 +18,7 @@
|
||||
|
||||
#include "trtllm/gen/CommonUtils.h"
|
||||
#include "trtllm/gen/SfLayoutDecl.h"
|
||||
#include <stdexcept>
|
||||
|
||||
#include "BatchedGemmEnums.h"
|
||||
#include "Enums.h"
|
||||
@ -51,11 +52,7 @@ namespace tg = trtllm::gen;
|
||||
namespace KernelParamsSetup
|
||||
{
|
||||
#ifdef TLLM_ENABLE_CUDA
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Member functions.
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class MatrixType
|
||||
{
|
||||
MatrixA = 0,
|
||||
@ -63,6 +60,38 @@ enum class MatrixType
|
||||
MatrixC
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Utility functions.
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename BatchedGemmOptions>
|
||||
bool useTmaOobOptA(BatchedGemmOptions const& options)
|
||||
{
|
||||
return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM && doesRouteImplUseNoRoute(options.mRouteImpl)
|
||||
&& options.mUseTmaOobOpt;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename BatchedGemmOptions>
|
||||
bool useTmaOobOptB(BatchedGemmOptions const& options)
|
||||
{
|
||||
return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchN && doesRouteImplUseNoRoute(options.mRouteImpl)
|
||||
&& options.mUseTmaOobOpt;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename BatchedGemmOptions>
|
||||
bool useTmaOobOptC(BatchedGemmOptions const& options)
|
||||
{
|
||||
return options.mUseTmaStore && options.mUseTmaOobOpt;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Create the TMA shape/stride for A/B/C.
|
||||
template <class GemmOptions>
|
||||
static auto makeTmaShapeStrideAbc(
|
||||
@ -73,60 +102,83 @@ static auto makeTmaShapeStrideAbc(
|
||||
bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput)
|
||||
|| (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput);
|
||||
|
||||
// Whether to use the TMA OOB trick to block out padded dummy tokens and save BW whenever no routing
|
||||
// is involved. It applies to batchM and matrixA, or batchN and matrixB, or any case for matrixC.
|
||||
bool const useTmaOobOpt = matrixType == MatrixType::MatrixA ? useTmaOobOptA(options)
|
||||
: matrixType == MatrixType::MatrixB ? useTmaOobOptB(options)
|
||||
: matrixType == MatrixType::MatrixC ? useTmaOobOptC(options)
|
||||
: false;
|
||||
|
||||
// The outer dimension.
|
||||
auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN;
|
||||
// The outer dimension tile size.
|
||||
auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM
|
||||
: (matrixType == MatrixType::MatrixA) ? tileM
|
||||
: tileN;
|
||||
auto ctaTileNumTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? tileM : tileN;
|
||||
// The outer dimension of TMA box shape.
|
||||
auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM : ctaTileNumTokens;
|
||||
|
||||
// The inner dimension.
|
||||
auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK;
|
||||
// The inner dimension tile size.
|
||||
auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : tileK;
|
||||
auto ctaTileHiddenSize = (matrixType == MatrixType::MatrixC) ? tileN : tileK;
|
||||
// The inner dimension of TMA box shape.
|
||||
auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : ctaTileHiddenSize;
|
||||
|
||||
// Swap matrix C sizes if output is transpose
|
||||
// Swap matrix C sizes if output is transposed.
|
||||
if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput)
|
||||
{
|
||||
numTokens = mN;
|
||||
hiddenSize = mM;
|
||||
tileNumTokens = options.mEpilogueTileN;
|
||||
tileHiddenSize = options.mEpilogueTileM;
|
||||
std::swap(numTokens, hiddenSize);
|
||||
std::swap(ctaTileNumTokens, ctaTileHiddenSize);
|
||||
std::swap(tileNumTokens, tileHiddenSize);
|
||||
}
|
||||
|
||||
// For a fused activation kernel, the hidden size of output is halved. TODO: That's true for
|
||||
// gated activations but not regular activations.
|
||||
if (options.mFusedAct)
|
||||
if (options.mFusedAct && matrixType == MatrixType::MatrixC)
|
||||
{
|
||||
if (matrixType == MatrixType::MatrixC)
|
||||
{
|
||||
hiddenSize /= 2;
|
||||
tileHiddenSize /= 2;
|
||||
}
|
||||
hiddenSize /= 2;
|
||||
tileHiddenSize /= 2;
|
||||
ctaTileHiddenSize /= 2;
|
||||
}
|
||||
|
||||
// The cute tensor shape for A/B: (numTokens, hiddenSize).
|
||||
// Note that TMA descriptor expects the first dimension's stride to be
|
||||
// 1, so swap the first two dimensions so that the hiddenSize dimension comes first.
|
||||
auto shape = std::vector<uint64_t>{static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens)};
|
||||
// If the matrix is a weights matrix, we use 3D logical shape for it (B, M, K) or (B, N, K).
|
||||
// Activations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K).
|
||||
if (isWeights)
|
||||
|
||||
// Activations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K).
|
||||
std::vector<uint64_t> shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens)};
|
||||
if (useTmaOobOpt /* also implies input/output activation */)
|
||||
{
|
||||
shape.push_back(static_cast<uint64_t>(options.mNumBatches));
|
||||
// If TMA OOB optimization is used, we use 3D logical shape (M, tileM, K) or (N, tileN, K).
|
||||
// The outer dimension is extended to make room for the possible counterbalance positive
|
||||
// offset from the middle "bound" dimension. The counterbalance should be no more than
|
||||
// ctaTileNumTokens.
|
||||
shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(ctaTileNumTokens),
|
||||
static_cast<uint64_t>(numTokens + ctaTileNumTokens)};
|
||||
}
|
||||
else if (isWeights)
|
||||
{
|
||||
// If the matrix is a weights matrix, we use 3D logical shape (B, M, K) or (B, N, K).
|
||||
shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens),
|
||||
static_cast<uint64_t>(options.mNumBatches)};
|
||||
}
|
||||
|
||||
// Assemble the stride (strideTokens, 1).
|
||||
// Swap the first two dimension as mentioned before.
|
||||
auto stride = std::vector<uint64_t>{1, static_cast<uint64_t>(hiddenSize)};
|
||||
if (isWeights)
|
||||
std::vector<uint64_t> stride = {1, static_cast<uint64_t>(hiddenSize)};
|
||||
if (useTmaOobOpt)
|
||||
{
|
||||
stride.push_back(static_cast<uint64_t>(hiddenSize * numTokens));
|
||||
stride = {1, static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(hiddenSize)};
|
||||
}
|
||||
else if (isWeights)
|
||||
{
|
||||
stride = {
|
||||
1, static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(hiddenSize) * static_cast<uint64_t>(numTokens)};
|
||||
}
|
||||
|
||||
// Assemble the box shape
|
||||
std::vector<int32_t> tileShape = {tileHiddenSize, tileNumTokens};
|
||||
|
||||
// Alternate layouts do not apply to matrixC
|
||||
// Alternate layouts (MajorMn and BlockMajorK) do not apply to matrixC
|
||||
if (matrixType != MatrixType::MatrixC)
|
||||
{
|
||||
gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB;
|
||||
@ -157,7 +209,7 @@ static auto makeTmaShapeStrideAbc(
|
||||
|
||||
// Create the TMA shape/stride for A/B block scaling factors.
|
||||
static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK,
|
||||
tg::Dtype dtypeElt, tg::SfLayout layout, int sfReshapeFactor)
|
||||
tg::SfLayout layout, int sfReshapeFactor, const int32_t numEltsPerSf)
|
||||
{
|
||||
|
||||
// The outer dimension.
|
||||
@ -168,8 +220,6 @@ static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType
|
||||
auto numTokensPerTile = matrixType == MatrixType::MatrixA ? tileM : tileN;
|
||||
// The inner tile dimension.
|
||||
auto hiddenSizePerTile = tileK;
|
||||
// Number of elements per scaling factor.
|
||||
const int32_t numEltsPerSf = (dtypeElt == tg::Dtype::E2m1) ? 16 : 32;
|
||||
|
||||
switch (layout)
|
||||
{
|
||||
@ -264,7 +314,7 @@ template <class GemmOptions_>
|
||||
static KernelParams setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, void const* ptrB,
|
||||
void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, void const* ptrPerTokenSfB,
|
||||
void const* ptrBias, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, float const* ptrClampLimit,
|
||||
float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, int32_t const* routeMap, float* rowMax,
|
||||
float const* ptrGatedActAlpha, float const* ptrGatedActBeta, int32_t const* routeMap, float* rowMax,
|
||||
uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr,
|
||||
int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr,
|
||||
int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = KernelParams::MaxNumCtas)
|
||||
@ -281,8 +331,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
params.ptrScaleC = ptrScaleC;
|
||||
params.ptrScaleGate = ptrScaleGate;
|
||||
params.ptrClampLimit = ptrClampLimit;
|
||||
params.ptrSwiGluAlpha = ptrSwiGluAlpha;
|
||||
params.ptrSwiGluBeta = ptrSwiGluBeta;
|
||||
params.ptrGatedActAlpha = ptrGatedActAlpha;
|
||||
params.ptrGatedActBeta = ptrGatedActBeta;
|
||||
|
||||
int32_t ctaOffset = 0;
|
||||
|
||||
@ -296,8 +346,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
for (int b = 0; b < options.mNumBatches; b++)
|
||||
{
|
||||
|
||||
int mM = batchM ? options.mBatchedM[b] : options.mN;
|
||||
int mN = batchM ? options.mM : options.mBatchedN[b];
|
||||
int mM = batchM ? options.mBatchedM[b] : options.mM;
|
||||
int mN = batchM ? options.mN : options.mBatchedN[b];
|
||||
|
||||
// Skip Tma descriptor creation if expert isn't used
|
||||
if (mM == 0 || mN == 0)
|
||||
@ -394,9 +444,10 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0;
|
||||
|
||||
// Build TMA descriptor for gmem A block scaling factors.
|
||||
auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches,
|
||||
options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK,
|
||||
options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor);
|
||||
auto [shapeSfA, strideSfA, tileShapesSfA]
|
||||
= makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, options.mN, options.mK, MatrixType::MatrixA,
|
||||
options.mTileM, options.mTileN, options.mTileK, tg::SfLayout::R128c4, options.mSfReshapeFactor,
|
||||
options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA)));
|
||||
params.tmaSfA[0]
|
||||
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast<void*>(dSfA));
|
||||
}
|
||||
@ -436,8 +487,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
|
||||
// Build TMA descriptor for gmem B block scaling factors.
|
||||
auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, inputNumTokensSfB,
|
||||
options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, options.mDtypeB,
|
||||
options.mSfLayoutB, options.mSfReshapeFactor);
|
||||
options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, options.mSfLayoutB,
|
||||
options.mSfReshapeFactor, tg::dtypeNumEltsPerSf(options.mDtypeB));
|
||||
params.tmaSfB[0]
|
||||
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast<void*>(dSfB));
|
||||
}
|
||||
@ -501,9 +552,10 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
auto const inputNumTokensSfA = ctaOffset * options.mTileM;
|
||||
|
||||
// Build TMA descriptor for gmem A block scaling factors.
|
||||
auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN,
|
||||
options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, options.mDtypeA,
|
||||
tg::SfLayout::R128c4, options.mSfReshapeFactor);
|
||||
auto [shapeSfA, strideSfA, tileShapesSfA]
|
||||
= makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, options.mK, MatrixType::MatrixA,
|
||||
options.mTileM, options.mTileN, options.mTileK, tg::SfLayout::R128c4, options.mSfReshapeFactor,
|
||||
options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA)));
|
||||
params.tmaSfA[0]
|
||||
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast<void*>(dSfA));
|
||||
}
|
||||
@ -517,7 +569,7 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
// Build TMA descriptor for gmem B block scaling factors.
|
||||
auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM,
|
||||
options.mN * options.mNumBatches, options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN,
|
||||
options.mTileK, options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor);
|
||||
options.mTileK, options.mSfLayoutB, options.mSfReshapeFactor, tg::dtypeNumEltsPerSf(options.mDtypeB));
|
||||
params.tmaSfB[0]
|
||||
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast<void*>(dSfB));
|
||||
}
|
||||
@ -562,4 +614,5 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace batchedGemm
|
||||
|
||||
} // namespace batchedGemm
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
|
||||
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
@ -19,6 +18,7 @@
|
||||
|
||||
namespace batchedGemm
|
||||
{
|
||||
|
||||
// This is device code
|
||||
|
||||
struct KernelParams
|
||||
@ -29,9 +29,58 @@ struct KernelParams
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Maximum number of CTAs
|
||||
// Maximum number of CTAs in the batch-token dimension.
|
||||
static constexpr int MaxNumCtas = 2048;
|
||||
|
||||
// NOTE: TMA out-of-bounds optimization for MoE padded tokens:
|
||||
//
|
||||
// Originally the padded tokens form a 2D tensor [hiddenDim, ctaGridDimY * tileN] with stride [1,
|
||||
// hiddenDim] and box size [tileM, tileN] at pointer p. We waste bandwidth bytes since we only
|
||||
// want to load [0, batchEnd) out of the [0, tileN) box size: batchEnd is a runtime variable while
|
||||
// box size needs to be fixed at compile time.
|
||||
//
|
||||
// To deal with this, we reshape the tensor to 3D: [hiddenDim, tileN, ctaGridDimY * tileN] with
|
||||
// stride [1, hiddenDim, hiddenDim] and box size [tileM, tileN, 1]. For the original 2D
|
||||
// tensor,
|
||||
//
|
||||
// Offset Coords [ : , ctaIdxY * tileN ],
|
||||
// Box Sizes [ : , tileN ],
|
||||
// Coords Range [ : , ctaIdxY * tileN : ctaIdxY * tileN + tileN],
|
||||
//
|
||||
// while we only want to load the range [ctaIdxY * tileN, ctaIdxY * tileN + batchEnd), 1 <= batchEnd
|
||||
// <= tileN
|
||||
//
|
||||
// For the reshaped 3D tensor,
|
||||
//
|
||||
// Offset Coords [ : , tileN - batchEnd ,
|
||||
// ctaIdxY * tileN + batchEnd ],
|
||||
// Box Sizes [ : , tileN ,
|
||||
// 1 ],
|
||||
// Coords Range [ : , tileN - batchEnd : min(tileN, 2 * tileN - batchEnd),
|
||||
// ctaIdxY * tileN + batchEnd : ctaIdx * tileN + batchEnd + 1],
|
||||
//
|
||||
// while min(tileN, 2 * tileN - batchEnd) always evaluates to tileN. The unwanted tokens are
|
||||
// essentially filtered out by utilizing the OOB feature of TMA. Since the 2nd and 3rd dimensions
// have the same stride, we end up loading the following (adding the left and right end of the 2nd
|
||||
// and 3rd dimension ranges):
|
||||
//
|
||||
// Effective 2D Coords Range
|
||||
// [ : , tileN + ctaIdxY * tileN : tileN + ctaIdxY * tileN + batchEnd],
|
||||
//
|
||||
// This is exactly the same as the original range except for the offset tileN, thus we also need
|
||||
// to offset the pointer in the opposite direction:
|
||||
//
|
||||
// Ptr (p) -> Ptr (p - tileN * hiddenDim)
|
||||
//
|
||||
// Due to the restrictions of the TMA unit, the above operations require the TMA descriptor and the
// underlying buffer to be constructed differently:
|
||||
// - Requires a valid buffer at (p - tileN * hiddenDim) - needs prepending `tileN` tokens.
|
||||
// - TMA outermost dimension must be extended by `tileN` or loads will OOB in the rightmost side.
|
||||
// The latter is because when batchEnd == tileN, the offset coords in the 3rd dimension becomes
|
||||
// ctaIdxY * tileN + tileN. When ctaIdxY = ctaGridDimY - 1, it becomes ((ctaGridDimY - 1) * tileN
|
||||
// + tileN = ctaGridDimY * tileN which is equal to the 3rd dimension size and will be filtered
|
||||
// out. That's why we need to extend the tensor size by tileN.
|
||||
//
|
||||
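To make the reshaping above concrete, here is a minimal sketch of how the OOB-optimized shape, stride, and pointer adjustment could be assembled. It is an illustration of this comment only (element counts rather than bytes, hypothetical helper name); it mirrors makeTmaShapeStrideAbc rather than replacing it. Requires <cstdint> and <vector>.

    // Hypothetical helper: 3D view used by the TMA OOB optimization.
    // shape  = (hiddenDim, tileN, numPaddedTokens + tileN), where numPaddedTokens = ctaGridDimY * tileN,
    // stride = (1, hiddenDim, hiddenDim), and the base pointer moves back by tileN tokens.
    struct OobView
    {
        std::vector<uint64_t> shape;
        std::vector<uint64_t> stride;
        int64_t ptrOffsetInElts;
    };

    inline OobView makeOobView(uint64_t hiddenDim, uint64_t numPaddedTokens, uint64_t tileN)
    {
        OobView view;
        view.shape = {hiddenDim, tileN, numPaddedTokens + tileN};
        view.stride = {1, hiddenDim, hiddenDim};
        // The 2nd and 3rd dimensions share a stride, so the per-CTA offsets
        // (tileN - batchEnd, ctaIdxY * tileN + batchEnd) add up to the original range
        // shifted by tileN; the pointer compensates in the opposite direction.
        view.ptrOffsetInElts = -static_cast<int64_t>(tileN * hiddenDim);
        return view;
    }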
// TMA descriptor for A.
|
||||
// Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from
|
||||
// makeTmaShapeStrideAbc.
|
||||
@ -211,15 +260,15 @@ struct KernelParams
|
||||
// x_linear = x_linear.clamp(min=-limit, max=limit)
|
||||
float const* ptrClampLimit{nullptr};
|
||||
|
||||
// The alpha and beta for SwiGlu.
|
||||
// The alpha and beta for SwiGlu or GeGlu.
|
||||
// Shape is [B]. One alpha and one beta per tensor in batch.
|
||||
// Alpha is 1.f if nullptr.
|
||||
// Beta is 0.f if nullptr.
|
||||
// The formula:
|
||||
// The formula for SwiGlu (for GeGlu, replace sigmoid with phi):
|
||||
//
|
||||
// out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
|
||||
float const* ptrSwiGluAlpha{nullptr};
|
||||
float const* ptrSwiGluBeta{nullptr};
|
||||
float const* ptrGatedActAlpha{nullptr};
|
||||
float const* ptrGatedActBeta{nullptr};
|
||||
|
||||
// The K dimension. It is the hidden dimension of the input matrices.
|
||||
int32_t k;
|
||||
|
||||
@ -19,7 +19,9 @@
|
||||
#include "Enums.h"
|
||||
#include "trtllm/gen/CommonUtils.h"
|
||||
#include "trtllm/gen/DtypeDecl.h"
|
||||
#include "trtllm/gen/MmaDecl.h"
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace batchedGemm
|
||||
{
|
||||
@ -77,18 +79,29 @@ public:
|
||||
}
|
||||
|
||||
// Returns the offset of the ith chunk
|
||||
int32_t getChunkOffset(int32_t ii) const
|
||||
int32_t getChunkOffsetByName(std::string const& name) const
|
||||
{
|
||||
if (mFirstChunkReuse[ii])
|
||||
for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii)
|
||||
{
|
||||
// Reuse the offset of the 0th chunk.
|
||||
return getChunkOffset(0);
|
||||
if (mSmemChunkNames[ii] == name)
|
||||
{
|
||||
return getChunkOffset(ii);
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Name not found: " + name);
|
||||
}
|
||||
|
||||
// Get offset of ii chunks.
|
||||
auto offset = getOffsetBeforeChunk(ii);
|
||||
// Ensure alignment for the current chunk
|
||||
return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second);
|
||||
// Returns the first chunk reuse flag given chunk name.
|
||||
int getFirstChunkReuseFlagByName(std::string const& name) const
|
||||
{
|
||||
for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii)
|
||||
{
|
||||
if (mSmemChunkNames[ii] == name)
|
||||
{
|
||||
return getFirstChunkReuseFlag(ii);
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Name not found: " + name);
|
||||
}
|
||||
|
||||
// Function to calculate the total size of the SMEM array
|
||||
@ -97,12 +110,6 @@ public:
|
||||
return getOffsetBeforeChunk(static_cast<int32_t>(mNumBytesAndAlignmentPerSmemChunk.size()));
|
||||
}
|
||||
|
||||
// Returns the first chunk reuse flag for the ith chunk.
|
||||
int getFirstChunkReuseFlag(int32_t ii) const
|
||||
{
|
||||
return mFirstChunkReuse[ii];
|
||||
}
|
||||
|
||||
// Print the contents of this object.
|
||||
void print() const
|
||||
{
|
||||
@ -115,6 +122,26 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
int32_t getChunkOffset(int32_t ii) const
|
||||
{
|
||||
if (mFirstChunkReuse[ii])
|
||||
{
|
||||
// Reuse the offset of the 0th chunk.
|
||||
return getChunkOffset(0);
|
||||
}
|
||||
|
||||
// Get offset of ii chunks.
|
||||
auto offset = getOffsetBeforeChunk(ii);
|
||||
// Ensure alignment for the current chunk
|
||||
return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second);
|
||||
}
|
||||
|
||||
// Returns the first chunk reuse flag for the ith chunk.
|
||||
int getFirstChunkReuseFlag(int32_t ii) const
|
||||
{
|
||||
return mFirstChunkReuse[ii];
|
||||
}
|
||||
|
||||
// Helper function to calculate padded size
|
||||
int32_t getSizePaddedToAlignment(int32_t size, int32_t alignment) const
|
||||
{
|
||||
@ -139,9 +166,7 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind)
|
||||
{
|
||||
if (mmaKind == tg::MmaKind::Auto)
|
||||
{
|
||||
std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl;
|
||||
assert(false);
|
||||
return -1;
|
||||
throw std::runtime_error("mmaKind != tg::MmaKind::Auto");
|
||||
}
|
||||
if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4)
|
||||
{
|
||||
@ -163,11 +188,11 @@ public:
|
||||
|
||||
// The constructor.
|
||||
KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA,
|
||||
tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM,
|
||||
int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK,
|
||||
int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput,
|
||||
AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA,
|
||||
bool usePerTokenSfB, BiasType biasType)
|
||||
tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t mmaK, int32_t tileM, int32_t tileN, int32_t tileK,
|
||||
int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma,
|
||||
int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore,
|
||||
bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8,
|
||||
bool usePerTokenSfA, bool usePerTokenSfB, BiasType biasType)
|
||||
: mMmaKind{mmaKind}
|
||||
{
|
||||
//
|
||||
@ -470,8 +495,8 @@ public:
|
||||
bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA);
|
||||
// Number of columns for scaling factors of A.
|
||||
auto const numTmemColsSfA = useConstSfA
|
||||
? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4)
|
||||
: (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0);
|
||||
? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK), 4)
|
||||
: (useBlockScalingA ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK)) * numStages : 0);
|
||||
// Number of columns for Sf alignment.
|
||||
auto const numColsAlignmentSfA = 4;
|
||||
// No need to reuse TMEM.
|
||||
@ -491,8 +516,8 @@ public:
|
||||
bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB);
|
||||
// Number of columns for scaling factors of B.
|
||||
auto const numTmemColsSfB = useConstSfB
|
||||
? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4)
|
||||
: (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0);
|
||||
? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK), 4)
|
||||
: (useBlockScalingB ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK)) * numStages : 0);
|
||||
// Number of columns for Sf alignment.
|
||||
auto const numColsAlignmentSfB = 4;
|
||||
// No need to reuse TMEM.
|
||||
@ -541,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits)

inline int32_t getSmemOffsetLoadA(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(0);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetLoadB(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(1);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB");
}

////////////////////////////////////////////////////////////////////////////////////////////////////
@ -562,64 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits)

inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(2);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0)
{
return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetRowMax(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(5);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetSliceK(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(6);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(7);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetBias(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(8);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetBlockAmax(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(9);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(10);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0)
{
// Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC().
return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx);
return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx));
}

////////////////////////////////////////////////////////////////////////////////////////////////////
@ -630,28 +654,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0)

inline int32_t getTmemOffsetD(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(0);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getTmemOffsetA(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(1);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getTmemOffsetSfA(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(2);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

inline int32_t getTmemOffsetSfB(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(3);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB");
}

////////////////////////////////////////////////////////////////////////////////////////////////////

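Both the shared-memory and tensor-memory offset getters above switch from positional chunk indices to string names, which removes the need to keep magic indices (such as the "3 + resIdx" coupling called out in the old comment) in sync across functions. Below is a minimal sketch of what such a name-addressed chunk helper could look like; the class name, method signatures, and alignment handling are illustrative assumptions, not the actual SmemAllocatorHelper / TmemAllocatorHelper implementation.

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    // Illustrative name-addressed chunk allocator (an assumption, not the real trtllm-gen helper).
    class NamedChunkAllocator
    {
    public:
        // Registers a chunk of sizeBytes, aligned to alignment, addressable by name.
        void addChunk(std::string const& name, int32_t sizeBytes, int32_t alignment = 16, bool reusesFirstChunk = false)
        {
            mNextOffset = ((mNextOffset + alignment - 1) / alignment) * alignment;
            mChunks.emplace(name, Chunk{mNextOffset, reusesFirstChunk});
            mNextOffset += sizeBytes;
        }

        int32_t getChunkOffsetByName(std::string const& name) const
        {
            return find(name).offset;
        }

        bool getFirstChunkReuseFlagByName(std::string const& name) const
        {
            return find(name).reusesFirstChunk;
        }

    private:
        struct Chunk
        {
            int32_t offset;
            bool reusesFirstChunk;
        };

        Chunk const& find(std::string const& name) const
        {
            auto it = mChunks.find(name);
            if (it == mChunks.end())
            {
                throw std::out_of_range("Unknown chunk name: " + name);
            }
            return it->second;
        }

        std::unordered_map<std::string, Chunk> mChunks;
        int32_t mNextOffset{0};
    };

Usage would mirror the getters above: register "smemLoadA", "smemGmemC0", "tmemSfB", and so on once while building the kernel traits, then resolve them later with getChunkOffsetByName, so adding or dropping an optional chunk cannot silently shift every downstream index.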
@ -181,6 +181,8 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st

if (result != CUDA_SUCCESS)
{
char const* errorString;
cuGetErrorString(result, &errorString);
std::stringstream ss;
ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl;

@ -283,8 +285,10 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector<uint64_t> c

if (result != CUDA_SUCCESS)
{
char const* errorString;
cuGetErrorString(result, &errorString);
std::stringstream ss;
ss << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl;
ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl;

ss << "tmaFormat: " << static_cast<int>(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl;

@ -213,7 +213,7 @@
"useCudaGraph": true,
"biasType": "m",
"act": "swiglu",
"patchF2fp": true,
"patchF2fp": false,
"clampLimit": 2
}
},

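The added cuGetErrorString calls resolve the raw CUresult code into a human-readable message when TMA descriptor creation fails. As a standalone illustration of that reporting pattern (the helper name and exception type below are ours, not part of this change):

    #include <cuda.h>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Hypothetical helper illustrating the error-reporting pattern used above:
    // resolve the CUresult to a readable string before surfacing the failure.
    inline void throwOnCuError(CUresult result, std::string const& what)
    {
        if (result != CUDA_SUCCESS)
        {
            char const* errorString = nullptr;
            // cuGetErrorString leaves the pointer untouched for unknown codes, hence the fallback.
            if (cuGetErrorString(result, &errorString) != CUDA_SUCCESS || errorString == nullptr)
            {
                errorString = "unrecognized CUDA driver error";
            }
            std::stringstream ss;
            ss << "Error: " << what << " failed with " << errorString << " (code " << static_cast<int>(result) << ")";
            throw std::runtime_error(ss.str());
        }
    }
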
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a00994f28fd8a090e81b27d5fccd661e7dbeb3638d57bb0b40d906116046a1d8
size 687798
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:579a6db1db6d9015a5460c4b169be47768547b9ebddf29e990978480ca340e21
size 564401
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f47b426fc1e92ed88dd23f54bc06ec4384857c94c6ab5779f9cb0fa124977e60
size 708572
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f63598e080f81bd5524c49e0b44c1c54e64937e55b1aedfe2fb462843012367c
size 584335
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:267dc78734e8c78037907a551db531cf85f96910f3370fb066f35815953e1166
size 671864
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:941ab2e3f61f132637289dc97f5565f9d6d0940d572a04124a3449629a2900dc
size 551623
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33870b92c93116930164d2bc211580dda35b9334c0e0ac4672a189c0469ea6bc
size 704674
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f4ff03ea3bb558d74d4b8235342c4ae39bd763df713a6f49e3861297272e5e2
size 577133
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f154e9d9d71264e281301773fc589bde477959bbae749192132ca4334f4166d9
size 728992
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3cde45cc498bc24b416c9f20a183f1ecf93296e5c22319b85abe1a85ab6c57cc
size 567953
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fc4093a459e21201290cef1c1217fa33899088af90286b4f30c3b194dab3346
size 748976
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:162dc89ea8cbf624f564e65f2c7aaa1ba595f9aa26629be803462fbd45432573
size 587937
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:60d640435b43f2da73acd9af496cd5f367257aa21a84b20fbda805d0f36dd778
size 674626
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b1f3ed94a446bca3ad59721dfcce8d48d14df5d8be98a4e47739df22ef9e59b
size 553843
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b67189fb9badfaa46164e92d93b7d6ff5710f7759f3149253e3b4f84432d76db
size 706944
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df1a2b41a4ccfe8794cff472bbceb8fc1501f1fecfd108f2b3ccb9a1d152cc32
size 579353
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86a7589d547ddca569573d7f9aa9b586c7e5310598a4f036cdeab7c47c45fcac
size 703782
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1534e134c3ee3313e797f04debd27fd2edf24ac2b4ad35d0936b3db617cc4e15
size 578905
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:749eaedcf2c05503d82ddac16b9e32a4e19df0bce2ec3a52387a6327149fa493
size 723618
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:383a09dc39a680812055757107892846735fa68ba5d655ef35df3d6294a9067c
size 599677
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:27ef0c4f7bd2372e78cd85c43b3885505c0faeb32f9acbb4f0f59885f07a74db
size 684148
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af2a95eee8c93e6e858ccc70e48e2460c0224e60e085bc403bfcb170804e757b
size 550637
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a521e3ca7395b7b0de604d199e41aa0f6c897085602bab99e5e5966b1865f1c
size 716464
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a27faab417980175bedb8032bef16cd7be7b7a334712fa6852ddb7c52ad18e3
size 576145
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c775c157681ca26635ceaffe084a1ed11952f08021137ce0e0206e25646ce878
size 683894
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3313eda5144a86a3009d4d80bcfbc0da796e14a6fed5597c29cc782edbf81faa
size 559905
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7f6a8a000d3d91ac5ea0e0484dfcddae0834a8bffe5495c03ce5ba5db41044b
size 703928
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c8c68901f54f93e836cb3fb4f2d1b553e4d0d88ed943d9fc5f8fbf3fd68f586
size 580679
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc6fe5e9c7016f3807ef9062e190e1183e1bf9bcdcc61662c330c6aeef6c971d
size 666874
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:14c285449fb66d2d4f570884e34d3c157a4dc3e31e54732b9a08eb790cb1e521
size 547967
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1eb6fcd0b60b9d627a9db0abbbbf5c9ab75a7efb433bbb4b2996341c1949f155
size 544199
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:55fbc0ae83071cf4538507ac0c328c278ef645e4ae76dc579742d2c8c8c2c483
size 699192
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38d80e09ae6a26d2f8e62186cfc9cdb8ebf2535d18f99fa6b0548a9c2dee0f2c
size 572687
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:245acbd9299bcc4637ca80a6f211c63cb2e2df61c4ad88de962038f8b75ce35b
size 568919
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7dbd4719b557daa126dbdc19d8561a8a30f164d99353bf2df6a86fd4a7876fd3
size 625070
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb8f90a52a64101a20215d80f5314a2567e12bbde2d1f07172a63fd45cf57717
size 499157
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73acfe8ba041176e5ff9b070b37d35009555e63c741a7f6df7c4ba12ddc9743e
size 651468
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99b66a013a7c3177672361397e6aca91ab2c7f40c0bfd9708740de6055ebc428
size 518055
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70ffb474cb37a69b71d3d4fa4798f396a62c3f93d096088ad89f4977221cdb3e
size 632568
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3356aac38b9cbd95278d93033c83cd7396eab034c3e055a0a037d02ebb4c2a8
size 509269
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6176eca42742e3dda6a850ff5616b89b00690fdca7a9350c98a7e0e68623c0cc
size 658078
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dde09ccde1787cadb4187cf1fda955cf6ebac98a3e2255dbe4f57302f88267a0
size 525505
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:869a4cbbf5767cd6fa40df88b35a67709cc3540746c4a8ed001a92d7ae2b0065
size 670456
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cab9ea767f345e1a458e88e1931070ac0a2fa85a34b49272605a735c35ccff8f
size 545185
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28344c0d4201c752444ff29d76941fca73f926c6916753eccc98b14ee357ef5d
size 702972
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4c46395656283eddab5a4111e464197a0ffbdb1b4e5d5ec95402b236e130be8
size 562357
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fdab9ed0c9f48820444f4c4b83e8bc71a6bb1a7a575ee0fae6f37612bd001419
size 502487
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:333ffe4dff54e5fd2661594715845c21b38944e4e61462ceb17b7b4a9ca9f79f
size 618108
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:362ffe2a2c759083cfc76fe00f1c39a5e8c04db1dadbd065b044151c7f9e2a4f
size 528651
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a5493719a699547366f804f9d48aa1370c66a3f846346ae9cc477b9d4be1fefc
size 519709
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea401e13acccae0bb820f8609b397765024eae85eae64e079bb2a8036e5b2139
size 644012
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ce72bac60c42659e72216df441bc40a958e8ea2348893ed48bdf743cd2746c0
size 546811
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5fe3927fe0060a52d10c2c1d88df5f4a09a5279c7e04451d857fabaabcfc8435
size 422969
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75c96fd88f6b34de2b2841364ac82a661e14ddb7791d8f9d334024c22ffe48fa
size 353741
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fdfda724b352141be7abdf193ea436ed4ca849a5cd28973d8782ed912c5ade0
size 349677
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e05eee158202502eb5f0db236294c7d7536a2ecc4c5960cfe337191a2986e74
size 444385
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b88a4fabac9c6b4dc123c71460a7ec3dfc76b2c8e6c5657d3c90642a522ba44
size 376439
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd7082a9bbc9ae814c60a678f8f16f3c3904f3bcd4de57ddc556b6dd6e721c18
size 372425
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41e61d3b107feb291c6c48a743dd4564cd45a2c4d4250abe8f1cac61629da941
size 624208
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f6fcbe4db5e85cdc8771b3d3fbbce4323cde2ca4d1564deee41edbc21c487788
size 629514
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:811a3e628f108a339865472c185a9e7d81e7f6a5a15ff016864fb0589c9bfb7c
size 521135
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d52dc5c75133d00907ca1b4cd23c78dd4f2344bf53b379a86ec2394884901046
size 519141
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f96b13880ed8cbcb0ef52f130e88e6b5be98d02ebe6065193358b5c3397a04eb
size 644784
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ffd0964d373cfcc548e0acc777a0b00eb6efd3628221532d8ba0ade59c02e4a8
size 650238
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bb9750f2d9777c3343b6cd115edab8eff1db02209c0e22305c81ca2589687fc
size 539195
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4deb54c0eb6e8b453c767ce1bf9b54d2cbe794145de626cae12b2497e00773b2
size 537399
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4bf181d0b77abd2a9a0ed3a6a442fb3dc857cb4c08d25809c7552c22d18b761
size 628500
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b8a56ae3ab37fae2dddb25b69bd2d606f88b2aef89b7a22daba62c50c44df5f8
size 656944
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:713e4d0aaebf35baee311932b5d4209e38bbc048ad3a4e857deeed622568d35b
size 525427
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:32c4bd4ebac53d09339bdd53e9da05e91e193286d3b3ff43dc1467ccf2e06e03
size 542525
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb2e4b0c44e6564f61ec1431f0a8e539ba13fc223972875ae876ffe0a338a198
size 649026
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:efe284fb3b81c595fe677a6c536897b7aa9bcbcbb6f5743d6bfe5f7d66da8227
size 677668
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae853f01146089fdf753f7fd205efb056594c90338cfbeb2e7857160650f86a1
size 544227
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4c22dbdcbd1b30b867f24566cfc4fed5f2319ce5f94f4599d10b1c2175da6e5
size 560733
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2179108f87b5ed9bf0db89f10e6a430aec24447c6b0970d7203a17fbe3add022
size 636048
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0bb0b13c0e2d2b9923c67cbe7ab8fe83e44f98c6e6f28064ab9f46eb82f52ebb
size 715206
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20113c7a7f5035ec72d861321115163d3a98f56005cb3c2a90cfd036c83ba019
size 533765
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fe9eadc9b4258ba1028110434aeb3a177e42bcef4a243dcabb2bc5112ac23c8
size 588207
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f2fd20d74c9346c0e2f5fc623c1ebfce9c7843b137b23bc0d3b0ea1ead4fc500
size 657364
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43b4fcad2e0d78292aebbec555652c6e3a3e4c9ccf684cb00f643ecba603b5bc
size 739286
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b504b4cdb647b2776962e94d4cb29b588db389ec598dfa5976a26bf462aa783
size 551825
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f92c775204623972f3b0e1011841212e2dde72cfedaea191059f6b210a526e7
size 607205
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e3c788929d5f84ad15759fd38c410fc81d8ff47ad397fa3eeddde16f48ac0a7
size 621242
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcca2cd8c24765aa9d7c6613193c5860be0957d825450329e1b2c2d655b62555
size 634738
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06fde6f5bef399e127d1d193c55fde8426ec5c7e31a43b8b98dbfdc2f368cadd
size 518071
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c63310ec110aeeae459a0e0a5d31f2ae1debad18a951c564109966958e95688
size 530087
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:642e348a5d1460393d41d3f6e066d563bf793c9bc53e782b9bcbf53aef654cc8
size 641670
@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d0daef205c07d0160f56756d4f3b3dfb2da133195ee83b64865ec6b343f2b5c
size 656252