[TRTLLM-4629] [feat] Step1: trtllm-gen kernels support sm103

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
This commit is contained in:
Xiwen Yu 2025-09-06 00:32:51 +08:00
parent 25389c9fe2
commit cca347e6b4
3131 changed files with 21998 additions and 13596 deletions

View File

@ -111,6 +111,8 @@ constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_89 = 89;
constexpr int32_t kSM_90 = 90;
constexpr int32_t kSM_100 = 100;
constexpr int32_t kSM_100f = 10100;
constexpr int32_t kSM_103 = 103;
constexpr int32_t kSM_120 = 120;
constexpr int32_t kSM_121 = 121;

View File

@ -218,8 +218,8 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
gemmData.mInputBuffers.mPtrPerTokenSfA = mOptions.transposeMmaOutput ? perTokensSfB : perTokensSfA;
gemmData.mInputBuffers.mPtrPerTokenSfB = mOptions.transposeMmaOutput ? perTokensSfA : perTokensSfB;
gemmData.mInputBuffers.mPtrBias = ptrBias;
gemmData.mInputBuffers.mPtrSwiGluAlpha = ptrAlpha;
gemmData.mInputBuffers.mPtrSwiGluBeta = ptrBeta;
gemmData.mInputBuffers.mPtrGatedActAlpha = ptrAlpha;
gemmData.mInputBuffers.mPtrGatedActBeta = ptrBeta;
gemmData.mInputBuffers.mPtrClampLimit = ptrClampLimit;
gemmData.mInputBuffers.mPtrRouteMap = routeMap;

View File

@ -247,22 +247,47 @@ struct BatchedGemmData
// The clamp limit for the accumulator before applying the activation.
// Shape is [B].
// Clamp is INF if nullptr.
// When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb.
// If applied on SwiGlu, it will be:
//
// x_glu = x_glu.clamp(min=None, max=limit)
// x_linear = x_linear.clamp(min=-limit, max=limit)
//
// The given clamp limit applies to the dequantized values, so the order of operations would
// look something like this:
//
// x0 = x0 * dqAb
// x0 = clamp(x0, none, limit)
// x0 = x0 * sigmoid(alpha * x0)
// x1 = dqAb * x1
// x1 = clamp(x1, -limit, limit)
// out = qC * (x1 + beta) * x0
//
// Given that the dqAb and qC are combined into scaleC, we can bring the dqAb into the clamp
// limit and apply the clamping prior to dequantization:
//
// x0 = clamp(x0, none, limit / dqAb)
// x0 = x0 * dqAb
// x0 = x0 * sigmoid(alpha * x0)
// x1 = clamp(x1, -limit / dqAb, limit / dqAb)
// scaleC = dqAb * qC
// beta' = beta / dqAb
// out = scaleC * (x1 + beta') * x0
//
// Note this assumes that scaleAb == scaleGate, which is true in the TRT-LLM MoE use-case.
//
float const* mPtrClampLimit{nullptr};
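A minimal host-side sketch (with made-up dqAb/limit values, not part of the diff) of why clamping the raw accumulator against limit / dqAb and dequantizing afterwards matches dequantizing first and clamping against limit:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>

int main()
{
    float const dqAb = 0.25f;   // combined dequant scale for A*B (illustrative value)
    float const limit = 4.0f;   // clamp limit on the dequantized accumulator
    float const x = 20.0f;      // raw (quantized-domain) accumulator value

    // Reference order: dequantize, then clamp.
    float const ref = std::min(x * dqAb, limit);

    // Folded order used by the kernel: clamp against limit / dqAb, then dequantize.
    // Equivalent because dqAb > 0.
    float const folded = std::min(x, limit / dqAb) * dqAb;

    assert(std::fabs(ref - folded) < 1e-6f);
    return 0;
}
```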
// The alpha and beta for SwiGlu.
// The alpha and beta for SwiGlu or GeGlu.
// gatedActivation <- (x0 + beta) * activation(x1, alpha)
// Shape is [B].
// Alpha is 1.f if nullptr.
// Beta is 0.f if nullptr.
// The formula:
// The formula for SwiGlu (for GeGlu, replace sigmoid with phi):
//
// out_glu = x_glu * torch.sigmoid(alpha * x_glu) + (x_linear + beta)
float const* mPtrSwiGluAlpha{nullptr};
float const* mPtrSwiGluBeta{nullptr};
// out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
float const* mPtrGatedActAlpha{nullptr};
float const* mPtrGatedActBeta{nullptr};
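For reference, a scalar C++ sketch of the gated activation described above (the inputs, alpha, and beta are illustrative values; per-batch scales are omitted):

```cpp
#include <cmath>
#include <cstdio>

// out_glu = x_glu * sigmoid(alpha * x_glu) * (x_linear + beta), as in the comment above.
float swiGluRef(float xGlu, float xLinear, float alpha, float beta)
{
    float const sig = 1.0f / (1.0f + std::exp(-alpha * xGlu));
    return xGlu * sig * (xLinear + beta);
}

int main()
{
    // With alpha = 1 and beta = 0 (the defaults when the pointers are nullptr) this
    // reduces to the plain GatedSilu case.
    std::printf("%f\n", swiGluRef(2.0f, 3.0f, /*alpha=*/1.0f, /*beta=*/0.0f));
    return 0;
}
```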
// Param is used when the kernel is configured with -routeAct true.
// The inputs are not padded, but the outputs are padded to divUpMul(M[bi], tileM) for batchM or
@ -432,10 +457,48 @@ public:
// Returns the number of available cubin configurations
size_t getNumBatchedGemmConfigs() const;
// Returns the number of CTAs of the last launched kernel.
int32_t getNumCtas() const
// Returns the grid dimensions of the current kernel.
std::tuple<int32_t, int32_t, int32_t> getGridDim(
BatchedGemmOptions const& options, std::optional<int32_t> maxNumCtasInBatchDim = std::nullopt) const
{
return mNumCtas;
bool const batchM = options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM;
int32_t numCtasBatch{0};
// For normal BMM, mNumTokens == 0 and the number of CTAs is known to host.
if (options.mIsStaticBatch)
{
for (int32_t bi = 0; bi < options.mNumBatches; ++bi)
{
numCtasBatch += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM)
: gemm::divUp(options.mBatchedN[bi], options.mTileN);
}
}
// For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime.
// We launch the maximum possible number of CTAs and use ptrNumNonExitingCtas to determine the
// actual number of CTAs to run.
else if ((options.mEnablesEarlyExit || options.mEnablesDelayedEarlyExit) && options.mNumTokens != 0)
{
assert(maxNumCtasInBatchDim.has_value()
&& "maxNumCtasInBatchDim must be provided when options.mNumTokens != 0");
numCtasBatch = maxNumCtasInBatchDim.value();
}
else
{
throw std::invalid_argument("Invalid combination of options");
}
int32_t const numCtasTile
= batchM ? gemm::divUp(options.mN, options.mTileN) : gemm::divUp(options.mM, options.mTileM);
int32_t const numCtasInner = options.mNumSlicesForSplitK;
return std::make_tuple(numCtasBatch, numCtasTile, numCtasInner);
}
// Returns the number of CTAs of the current kernel.
int32_t getNumCtas(
BatchedGemmOptions const& options, std::optional<int32_t> maxNumCtasInBatchDim = std::nullopt) const
{
auto [numCtasBatch, numCtasTile, numCtasInner] = getGridDim(options, maxNumCtasInBatchDim);
return numCtasBatch * numCtasTile * numCtasInner;
}
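As a sanity check, a small standalone sketch of the same arithmetic getGridDim/getNumCtas perform for a static batchM problem (the batch sizes and tile sizes below are made up):

```cpp
#include <cstdio>

// Same rounding helper as gemm::divUp.
static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // Hypothetical static batchM problem: two batches with 48 and 80 tokens, tileM = 64,
    // N = 256, tileN = 128, split-K = 2.
    int const batchedM[] = {48, 80};
    int const tileM = 64, n = 256, tileN = 128, numSlicesForSplitK = 2;

    int numCtasBatch = 0;
    for (int m : batchedM)
    {
        numCtasBatch += divUp(m, tileM);            // 1 + 2 = 3
    }
    int const numCtasTile = divUp(n, tileN);        // 2
    int const numCtasInner = numSlicesForSplitK;    // 2

    // getNumCtas() is simply the product of the three grid dimensions.
    std::printf("grid = (%d, %d, %d), numCtas = %d\n", numCtasBatch, numCtasTile, numCtasInner,
        numCtasBatch * numCtasTile * numCtasInner); // grid = (3, 2, 2), numCtas = 12
    return 0;
}
```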
// Returns true if the configuration of the cubin can be executed for the given params.
@ -453,10 +516,6 @@ private:
// Returns the size padded to the alignment
size_t getSizePaddedToAlignment(size_t size, size_t alignment) const;
private:
// Number of the CTAs of the last launched kernel.
int32_t mNumCtas{0};
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -518,7 +577,7 @@ bool BatchedGemmInterface::isValidConfig(BatchedGemmConfig const& config, Batche
auto options = getOptionsFromConfigAndData(config, data);
// Is Blackwell?
bool isBlackwell = config.mSm == gemm::SmVersion::Sm100a;
bool isBlackwell = gemm::isSmVersionBlackwell(config.mSm);
// Check options without modifications.
return checkAndUpdateBatchedGemmOptions(options, isBlackwell,
@ -629,46 +688,23 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa
}
}
int32_t numCtaXy{0};
if (options.mIsStaticBatch)
{
for (int32_t bi = 0; bi < options.mNumBatches; ++bi)
{
numCtaXy += batchM ? gemm::divUp(options.mBatchedM[bi], options.mTileM)
: gemm::divUp(options.mBatchedN[bi], options.mTileN);
}
}
int32_t maxNumCtasInBatchDim{numCtaXy};
// For normal BMM, mNumTokens == 0 and the number of CTAs is known to host.
// For MoE, mNumTokens != 0 and the number of CTAs is known only at runtime.
// We launch the maximum possible number of CTAs and use ptrNumNonExitingCtas to determine
// the actual number of CTAs to run.
if ((options.mEnablesEarlyExit || options.mEnablesDelayedEarlyExit) && options.mNumTokens != 0)
{
// Get maximum number of CTAs in batch dim.
maxNumCtasInBatchDim = batchedGemmData.mProblemDimensions.mMaxNumCtasInTokenDim;
}
auto const numCtaX = batchM ? maxNumCtasInBatchDim : gemm::divUp(options.mM, options.mTileM);
auto const numCtaY = batchM ? gemm::divUp(options.mN, options.mTileN) : maxNumCtasInBatchDim;
auto const numCtaZ = options.mNumSlicesForSplitK;
mNumCtas = numCtaX * numCtaY * numCtaZ;
auto [numCtaBatch, numCtaTile, numCtaInner]
= getGridDim(options, batchedGemmData.mProblemDimensions.mMaxNumCtasInTokenDim);
auto kernelParams = KernelParamsSetup::setKernelParams(options, batchM, batchedGemmData.mInputBuffers.mPtrA,
batchedGemmData.mInputBuffers.mPtrB, batchedGemmData.mOutputBuffers.mPtrC,
batchedGemmData.mInputBuffers.mPtrSfA, batchedGemmData.mInputBuffers.mPtrSfB,
batchedGemmData.mInputBuffers.mPtrPerTokenSfA, batchedGemmData.mInputBuffers.mPtrPerTokenSfB,
batchedGemmData.mInputBuffers.mPtrBias, batchedGemmData.mOutputBuffers.mPtrSfC,
batchedGemmData.mInputBuffers.mPtrScaleC, batchedGemmData.mInputBuffers.mPtrScaleGate,
batchedGemmData.mInputBuffers.mPtrClampLimit, batchedGemmData.mInputBuffers.mPtrSwiGluAlpha,
batchedGemmData.mInputBuffers.mPtrSwiGluBeta, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax,
batchedGemmData.mInputBuffers.mPtrClampLimit, batchedGemmData.mInputBuffers.mPtrGatedActAlpha,
batchedGemmData.mInputBuffers.mPtrGatedActBeta, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax,
dPtrRowMaxBars, batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas,
batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx,
batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, maxNumCtasInBatchDim);
batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, numCtaBatch);
// The size of the grid.
std::vector<int32_t> grid{numCtaX, numCtaY, numCtaZ};
std::vector<int32_t> grid = batchM ? std::vector<int32_t>{numCtaBatch, numCtaTile, numCtaInner}
: std::vector<int32_t>{numCtaTile, numCtaBatch, numCtaInner};
#ifdef TLLM_GEN_EXPORT_INTERFACE
CUmodule cuModule;

View File

@ -20,6 +20,7 @@
#include "GemmGatedActOptions.h"
#include "GemmOptions.h"
#include <cstdint>
#include <vector>
#ifndef TLLM_GEN_EXPORT_INTERFACE
@ -32,17 +33,19 @@
if (!(cond)) \
{ \
printArgs(__VA_ARGS__); \
printArgs("\n"); \
return false; \
}
#define TLLM_LOG_ERROR(...) TLLM_CHECK_ERROR(false, __VA_ARGS__)
#define TLLM_CHECK_ERROR_FMT(...) TLLM_CHECK_ERROR(false, __VA_ARGS__)
#define TLLM_CHECK_ERROR_FMT(cond, ...) TLLM_CHECK_ERROR(cond, __VA_ARGS__)
#define TLLM_CHECK_WARNING(cond, ...) \
if (!(cond)) \
{ \
printArgs(__VA_ARGS__); \
printArgs("\n"); \
return false; \
}
@ -50,7 +53,7 @@
#define TLLM_LOG_INFO(...) TLLM_CHECK_WARNING(false, __VA_ARGS__)
#endif
#endif // TLLM_GEN_EXPORT_INTERFACE
namespace batchedGemm
{
@ -95,11 +98,12 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
bool useShuffledMatrixA, bool sliceK, gemm::SplitK splitK, bool transposeMmaOutput, int tileM, int tileN,
int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule,
bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps,
bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC,
int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType, bool clampBeforeAct,
std::vector<int> batchedM, std::vector<int> batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch,
int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting, bool fusedAct,
int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps)
bool useTwoMmaWarps, std::optional<int32_t> sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB,
tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler,
gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector<int> batchedM, std::vector<int> batchedN,
BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl,
bool gridWaitForPrimaryRouting, bool fusedAct, int numRegsPerThreadNonEpilogueWarp,
int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt)
: gemmGatedAct::GemmGatedActOptions(
gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA,
dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs,
@ -110,21 +114,22 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp,
useShuffledMatrixA, sliceK, splitK, transposeMmaOutput, tileM, tileN, tileK, useUnrollLoop2xForMma,
useCustomMmaSchedule, useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8, usePerTokenSfA,
usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfLayoutA, sfLayoutB, sfLayoutC,
sfReshapeFactor, tileScheduler),
usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfBlockSizeA, sfLayoutA, sfLayoutB,
sfLayoutC, sfReshapeFactor, tileScheduler),
actType, clampBeforeAct)
, mBatchedM(batchedM)
, mBatchedN(batchedN)
, mBatchMode(BatchMode(batchMode))
, mNumBatches(numBatches)
, mIsStaticBatch(isStaticBatch)
, mNumTokens(numTokens)
, mRouteImpl(routeImpl)
, mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting)
, mFusedAct(fusedAct)
, mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting)
, mIsStaticBatch(isStaticBatch)
, mNumBatches(numBatches)
, mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp)
, mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp)
, mNumRegsCastAWarps(numRegsCastAWarps)
, mNumTokens(numTokens)
, mRouteImpl(routeImpl)
, mUseTmaOobOpt(useTmaOobOpt)
{
}
@ -134,28 +139,28 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
std::vector<int> mBatchedN;
// Whether batching M or N.
BatchMode mBatchMode{BatchMode::BatchM};
// Number of Gemm batches.
int mNumBatches;
// Whether the batch size is static (i.e. known at kernel launch time).
bool mIsStaticBatch{true};
// Total number of tokens.
int mNumTokens{32};
// Whether to load the input tokens and do routing.
RouteImpl mRouteImpl{RouteImpl::NoRoute};
// Whether to perform a fused gated activation.
bool mFusedAct{false};
// Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens,
// ptrCtaIdxXyToBatchIdx, etc.. should wait on a grid dependency.
bool mGridWaitForPrimaryRouting{true};
// Whether to perform a fused gated activation.
bool mFusedAct{false};
// Whether the batch size is static (i.e. known at kernel launch time).
bool mIsStaticBatch{true};
// Number of Gemm batches.
int mNumBatches;
// Number of registers per thread for non-epilogue warps
int mNumRegsPerThreadNonEpilogueWarp{0};
// Number of registers per thread for epilogue warps
int mNumRegsPerThreadEpilogueWarp{0};
// Number of registers for the cast A warps.
int mNumRegsCastAWarps{0};
// Total number of tokens.
int mNumTokens{32};
// Whether to load the input tokens and do routing.
RouteImpl mRouteImpl{RouteImpl::NoRoute};
// Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in
// BatchedGemm/KernelParamsDecl.h.
bool mUseTmaOobOpt{false};
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -165,6 +170,20 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
{
bool isValid = true;
if (options.mUseTmaOobOpt && !options.mUseTwoTmaLoadWarps)
{
if (updateOptions)
{
// Any routing (mRouteImpl != NoRoute) requires mUseTwoTmaLoadWarps == true, so a single
// TMA load warp is not the target use case for the OOB optimization.
options.mUseTmaOobOpt = false;
}
else if (!options.mUseTwoTmaLoadWarps)
{
TLLM_CHECK_ERROR(false, "TMA OOB optimization requires two TMA load warps.");
return false;
}
}
if (options.mFusedAct)
{
// ensure that we check the fused options as well
@ -198,22 +217,19 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
}
}
for (int b = 0; b < options.mNumBatches; b++)
if (batchM)
{
if (batchM)
{
TLLM_CHECK_ERROR(options.mN > 0 && options.mK > 0, "N and K must be larger than 0");
TLLM_CHECK_ERROR(options.mN >= options.mTileN, "N must be equal or larger than TileN.");
TLLM_CHECK_ERROR(options.mN % options.mTileN == 0, "N must be divisible by TileN.");
TLLM_CHECK_ERROR(!options.mTransposeMmaOutput, "When batchM the MMA output has to be in row-major.");
}
else
{
TLLM_CHECK_ERROR(options.mM > 0 && options.mK > 0, "M and K must be larger than 0");
TLLM_CHECK_ERROR(options.mM >= options.mTileM, "N must be equal or larger than tileN.");
TLLM_CHECK_ERROR(options.mM % options.mTileM == 0, "M must be divisible by TileM.");
TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "When batchN the MMA output has to be in column-major.");
}
TLLM_CHECK_ERROR(options.mN > 0 && options.mK > 0, "N and K must be larger than 0");
TLLM_CHECK_ERROR(options.mN >= options.mTileN, "N must be equal or larger than TileN.");
TLLM_CHECK_ERROR(options.mN % options.mTileN == 0, "N must be divisible by TileN.");
TLLM_CHECK_ERROR(!options.mTransposeMmaOutput, "When batchM the MMA output has to be in row-major.");
}
else
{
TLLM_CHECK_ERROR(options.mM > 0 && options.mK > 0, "M and K must be larger than 0");
TLLM_CHECK_ERROR(options.mM >= options.mTileM, "M must be equal or larger than TileM.");
TLLM_CHECK_ERROR(options.mM % options.mTileM == 0, "M must be divisible by TileM.");
TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "When batchN the MMA output has to be in column-major.");
}
if (options.mUseDeepSeekFp8)
@ -367,7 +383,8 @@ inline std::string dumpOptions(BatchedGemmOptions const& options)
ss << "mFusedAct=" << options.mFusedAct << "," << std::endl;
ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl;
ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl;
ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl;
ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl;
ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl;
return ss.str();
}

View File

@ -67,7 +67,12 @@ enum class ActType
// beta' = beta / scaleAb, scaleC' = scaleC * scaleAb.
//
// GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0.
SwiGlu
SwiGlu,
// For ActType == GeGlu, we use the simplified version
// gatedAct = scaleC' * (x0 + beta') * ((x1 * scaleGate) * phi(alpha * x1 * scaleGate)),
// where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales,
// beta' = beta / scaleAb, scaleC' = scaleC * scaleAb.
GeGlu,
};
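A scalar sketch of the simplified GeGlu above, assuming phi denotes the standard normal CDF used by GELU (the scales and inputs are illustrative):

```cpp
#include <cmath>
#include <cstdio>

// Standard normal CDF, assuming this is the phi() referenced by the GeGlu formula above.
float phi(float x)
{
    return 0.5f * (1.0f + std::erf(x / std::sqrt(2.0f)));
}

// Simplified GeGlu from the comment: scaleC' * (x0 + beta') * (x1' * phi(alpha * x1')),
// where x1' = x1 * scaleGate.
float geGluRef(float x0, float x1, float alpha, float betaPrime, float scaleGate, float scaleCPrime)
{
    float const x1s = x1 * scaleGate;
    return scaleCPrime * (x0 + betaPrime) * (x1s * phi(alpha * x1s));
}

int main()
{
    std::printf("%f\n", geGluRef(1.0f, 2.0f, /*alpha=*/1.0f, /*betaPrime=*/0.0f,
        /*scaleGate=*/1.0f, /*scaleCPrime=*/1.0f));
    return 0;
}
```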
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -81,6 +86,7 @@ enum class ActType
}
TLLM_ACT_TYPE_FUNCTION(SwiGlu)
TLLM_ACT_TYPE_FUNCTION(GeGlu)
#undef TLLM_ACT_TYPE_FUNCTION
@ -91,6 +97,7 @@ inline std::string getActTypeName(ActType type)
switch (type)
{
case ActType::SwiGlu: return "SwiGlu";
case ActType::GeGlu: return "GeGlu";
default: return "Unknown type";
}
}
@ -179,7 +186,7 @@ inline std::string dumpOptions(GemmGatedActOptions const& options)
ss << gemm::dumpOptions(options) << ", ";
ss << "mActType="
<< "gemmGatedAct::ActType(" << static_cast<int32_t>(options.mActType) << ")," << std::endl;
ss << "mClampLimit=" << options.mClampBeforeAct << "," << std::endl;
ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl;
return ss.str();
}

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include <optional>
#include <set>
#include <sstream>
@ -31,23 +32,30 @@
#else
#include <iostream>
template <typename T>
void printArgs(T arg)
{
#ifdef TLLM_GEN_DEBUG
std::cout << arg;
#endif
}
template <typename T, typename... Args>
void printArgs(T first, Args... args)
{
#ifdef TLLM_GEN_DEBUG
std::cout << first;
printArgs(first);
if constexpr (sizeof...(args) > 0)
{
std::cout << " ";
printArgs(", ");
printArgs(args...);
}
#endif
}
#define TLLM_CHECK_ERROR(cond, ...) \
if (!(cond)) \
{ \
printArgs(__VA_ARGS__); \
printArgs("\n"); \
return false; \
}
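A self-contained sketch of how the printArgs/TLLM_CHECK_ERROR pair above behaves (here printing is unconditional, whereas the header only prints when TLLM_GEN_DEBUG is defined; the checkTileN function and its message are illustrative):

```cpp
#include <iostream>

// Mirrors the definitions above: single-argument printer plus a variadic version that
// inserts ", " between arguments.
template <typename T>
void printArgs(T arg)
{
    std::cout << arg;
}

template <typename T, typename... Args>
void printArgs(T first, Args... args)
{
    printArgs(first);
    if constexpr (sizeof...(args) > 0)
    {
        printArgs(", ");
        printArgs(args...);
    }
}

#define TLLM_CHECK_ERROR(cond, ...)                                                                \
    if (!(cond))                                                                                   \
    {                                                                                              \
        printArgs(__VA_ARGS__);                                                                    \
        printArgs("\n");                                                                           \
        return false;                                                                              \
    }

// The macro returns false from the enclosing function, so it is used inside bool-returning checks.
bool checkTileN(int n, int tileN)
{
    TLLM_CHECK_ERROR(n % tileN == 0, "N (", n, ") must be divisible by TileN (", tileN, ")");
    return true;
}

int main()
{
    // Prints "N (, 100, ) must be divisible by TileN (, 64, )" and checkTileN returns false.
    checkTileN(100, 64);
    return 0;
}
```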
@ -59,6 +67,7 @@ void printArgs(T first, Args... args)
if (!(cond)) \
{ \
printArgs(__VA_ARGS__); \
printArgs("\n"); \
return false; \
}
@ -66,7 +75,7 @@ void printArgs(T first, Args... args)
#define TLLM_LOG_INFO(...) TLLM_CHECK_WARNING(false, __VA_ARGS__)
#endif
#endif // TLLM_GEN_EXPORT_INTERFACE
namespace batchedGemm
{
@ -103,8 +112,9 @@ struct GemmOptions
bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM,
int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule,
bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB,
bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB,
tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler)
bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, std::optional<int32_t> sfBlockSizeA,
tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int sfReshapeFactor,
TileScheduler tileScheduler)
: mAllReduceAlgo{allReduceAlgo}
, mBiasType{biasType}
, mBlockK(blockK)
@ -167,6 +177,7 @@ struct GemmOptions
, mUseTmaStore{useTmaStore}
, mUseTwoTmaLoadWarps{useTwoTmaLoadWarps}
, mUseTwoMmaWarps{useTwoMmaWarps}
, mSfBlockSizeA{sfBlockSizeA}
, mSfLayoutA{sfLayoutA}
, mSfLayoutB{sfLayoutB}
, mSfLayoutC{sfLayoutC}
@ -313,6 +324,8 @@ struct GemmOptions
bool mUseTwoTmaLoadWarps{false};
// Use two different warps for MMA tasks. Applicable only to DeepSeek FP8.
bool mUseTwoMmaWarps{false};
// Scale factor block size for A. Used when dtypeA == E2m1 and dtypeB == E4m3.
std::optional<int32_t> mSfBlockSizeA{std::nullopt};
// Scale factors layout for A.
tg::SfLayout mSfLayoutA{tg::SfLayout::R128c4};
// Scale factors layout for B.
@ -334,9 +347,18 @@ struct GemmOptions
enum class SmVersion
{
Sm90a,
Sm100a
Sm100a,
Sm100f,
Sm103a
};
////////////////////////////////////////////////////////////////////////////////////////////////////
inline bool isSmVersionBlackwell(SmVersion smVersion)
{
return smVersion == SmVersion::Sm100a || smVersion == SmVersion::Sm100f || smVersion == SmVersion::Sm103a;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// GemmConfig
@ -478,6 +500,16 @@ inline std::string dumpOptions(GemmOptions const& options)
ss << "mUseTmaStore=" << options.mUseTmaStore << "," << std::endl;
ss << "mUseTwoTmaLoadWarps=" << options.mUseTwoTmaLoadWarps << "," << std::endl;
ss << "mUseTwoMmaWarps=" << options.mUseTwoMmaWarps << "," << std::endl;
if (options.mSfBlockSizeA.has_value())
{
ss << "mSfBlockSizeA=" << options.mSfBlockSizeA.value() << "," << std::endl;
}
else
{
ss << "mSfBlockSizeA="
<< "std::nullopt"
<< ", " << std::endl;
}
ss << "mSfLayoutA="
<< "trtllm::gen::SfLayout(" << static_cast<int32_t>(options.mSfLayoutA) << ")"
<< "," << std::endl;
@ -527,6 +559,7 @@ inline int32_t getShuffleBlockSize(int epilogueTileM)
inline bool checkAndUpdateGemmOptions(
GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true)
{
if (options.mDtypeB == tg::Dtype::Void)
{
if (updateOptions)
@ -567,7 +600,8 @@ inline bool checkAndUpdateGemmOptions(
// Currently, we only support {MxFp4, NvFp4} -> Bf16.
TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA)
|| ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1)
&& options.mDtypeMmaA == tg::Dtype::Bfloat16),
&& options.mDtypeMmaA == tg::Dtype::Bfloat16)
|| (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3),
"Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA));
// Check that the B cast is supported.
@ -716,7 +750,20 @@ inline bool checkAndUpdateGemmOptions(
{
TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell");
int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32;
int mmaK = 32;
if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4)
{
if (options.mMmaK == 96)
{
mmaK = 96;
TLLM_CHECK_ERROR(options.mTileK == 768, "When mmaK == 96, only tileK == 768 is supported");
TLLM_CHECK_ERROR(options.mTileN <= 128, "When mmaK == 96, only tileN <= 128 is supported");
}
else
{
mmaK = 64;
}
}
if (options.mMmaK != mmaK)
{
int newTileK = mmaK * divUp(options.mTileK, mmaK);
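When the selected mmaK differs from the configured one, tileK is rounded up to the next multiple of mmaK. A small standalone sketch of that rounding step (the values are made up):

```cpp
#include <cstdio>

// Same rounding helper as gemm::divUp.
static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    int const mmaK = 64;     // selected for MxFp4NvFp4 when mMmaK != 96
    int const tileK = 96;    // requested tile size (hypothetical)
    int const newTileK = mmaK * divUp(tileK, mmaK);
    std::printf("tileK %d -> %d\n", tileK, newTileK); // tileK 96 -> 128
    return 0;
}
```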
@ -737,9 +784,27 @@ inline bool checkAndUpdateGemmOptions(
TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN,
") must be >= 64 or equal to TileN (", options.mTileN, ")");
}
if (options.mSfBlockSizeA.has_value())
{
// Only E2m1 x E4m3 is tested. MxE2m1 x bf16 may also work.
TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeB == tg::Dtype::E4m3,
"sfBlockSizeA is only supported for E2m1 and E4m3 types. Found dtypeA=", tg::dtypeToString(options.mDtypeA),
" dtypeB=", tg::dtypeToString(options.mDtypeB));
// sfBlockSizeA must be 16 or 32.
// SfBlockSizeA can also support 64 and 128, although they are not an officially supported NVIDIA
// format. Note that the type conversion needs to happen before the TCs.
// For example, convert e2m1 to e4m3 inside TmemCastA.
// If we want to support sfBlockSizeA=8, we can write another version of convertE2m1ToSfE4m3,
// which only packs 8 e2m1 elements.
TLLM_CHECK_ERROR(options.mSfBlockSizeA.value() == 16 || options.mSfBlockSizeA.value() == 32, "SfBlockSizeA (",
options.mSfBlockSizeA.value(), ") must be 16 or 32.");
}
if (tg::dtypeIsBlockFmt(options.mDtypeA))
{
int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA);
int numEltsPerSfA = options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA));
TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK,
") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA));
auto const numEltsPerSfAInK = options.mK / numEltsPerSfA;
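A quick standalone illustration of the numEltsPerSfA selection above: the optional sfBlockSizeA overrides the per-dtype default, and tileK must then be a multiple of 4 * numEltsPerSfA (the values below are made up):

```cpp
#include <cstdio>
#include <optional>

int main()
{
    // Default elements per scale factor for E2m1 is 16 (see dtypeNumEltsPerSf); an explicit
    // sfBlockSizeA of 32 overrides it.
    std::optional<int> sfBlockSizeA = 32;
    int const dtypeDefault = 16;
    int const numEltsPerSfA = sfBlockSizeA.value_or(dtypeDefault);

    int const tileK = 256;
    bool const ok = (tileK % (4 * numEltsPerSfA) == 0);   // 256 % 128 == 0
    std::printf("numEltsPerSfA = %d, tileK check = %s\n", numEltsPerSfA, ok ? "pass" : "fail");
    return 0;
}
```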
@ -1293,8 +1358,8 @@ inline bool checkAndUpdateGemmOptions(
{
// Init kernel traits.
options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc,
options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK,
options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma,
options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mMmaK, options.mTileM, options.mTileN,
options.mTileK, options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma,
options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore,
options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent,
options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType);

View File

@ -18,6 +18,7 @@
#include "trtllm/gen/CommonUtils.h"
#include "trtllm/gen/SfLayoutDecl.h"
#include <stdexcept>
#include "BatchedGemmEnums.h"
#include "Enums.h"
@ -51,11 +52,7 @@ namespace tg = trtllm::gen;
namespace KernelParamsSetup
{
#ifdef TLLM_ENABLE_CUDA
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Member functions.
//
//////////////////////////////////////////////////////////////////////////////////////////////////
enum class MatrixType
{
MatrixA = 0,
@ -63,6 +60,38 @@ enum class MatrixType
MatrixC
};
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Utility functions.
//
//////////////////////////////////////////////////////////////////////////////////////////////////
template <typename BatchedGemmOptions>
bool useTmaOobOptA(BatchedGemmOptions const& options)
{
return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM && doesRouteImplUseNoRoute(options.mRouteImpl)
&& options.mUseTmaOobOpt;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
template <typename BatchedGemmOptions>
bool useTmaOobOptB(BatchedGemmOptions const& options)
{
return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchN && doesRouteImplUseNoRoute(options.mRouteImpl)
&& options.mUseTmaOobOpt;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
template <typename BatchedGemmOptions>
bool useTmaOobOptC(BatchedGemmOptions const& options)
{
return options.mUseTmaStore && options.mUseTmaOobOpt;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Create the TMA shape/stride for A/B/C.
template <class GemmOptions>
static auto makeTmaShapeStrideAbc(
@ -73,60 +102,83 @@ static auto makeTmaShapeStrideAbc(
bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput)
|| (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput);
// Whether to use the TMA OOB trick to block out padded dummy tokens and save BW whenever no routing
// is involved. It applies to batchM and matrixA, or batchN and matrixB, or any case for matrixC.
bool const useTmaOobOpt = matrixType == MatrixType::MatrixA ? useTmaOobOptA(options)
: matrixType == MatrixType::MatrixB ? useTmaOobOptB(options)
: matrixType == MatrixType::MatrixC ? useTmaOobOptC(options)
: false;
// The outer dimension.
auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN;
// The outer dimension tile size.
auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM
: (matrixType == MatrixType::MatrixA) ? tileM
: tileN;
auto ctaTileNumTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? tileM : tileN;
// The outer dimension of TMA box shape.
auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM : ctaTileNumTokens;
// The inner dimension.
auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK;
// The inner dimension tile size.
auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : tileK;
auto ctaTileHiddenSize = (matrixType == MatrixType::MatrixC) ? tileN : tileK;
// The inner dimension of TMA box shape.
auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : ctaTileHiddenSize;
// Swap matrix C sizes if output is transpose
// Swap matrix C sizes if output is transposed.
if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput)
{
numTokens = mN;
hiddenSize = mM;
tileNumTokens = options.mEpilogueTileN;
tileHiddenSize = options.mEpilogueTileM;
std::swap(numTokens, hiddenSize);
std::swap(ctaTileNumTokens, ctaTileHiddenSize);
std::swap(tileNumTokens, tileHiddenSize);
}
// For a fused activation kernel, the hidden size of output is halved. TODO: That's true for
// gated activations but not regular activations.
if (options.mFusedAct)
if (options.mFusedAct && matrixType == MatrixType::MatrixC)
{
if (matrixType == MatrixType::MatrixC)
{
hiddenSize /= 2;
tileHiddenSize /= 2;
}
hiddenSize /= 2;
tileHiddenSize /= 2;
ctaTileHiddenSize /= 2;
}
// The cute tensor shape for A/B: (numTokens, hiddenSize).
// Note that the TMA descriptor expects the first dimension's stride to be
// 1, so swap the first two dimensions so that the hiddenSize dimension comes first.
auto shape = std::vector<uint64_t>{static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens)};
// If the matrix is a weights matrix, we use 3D logical shape for it (B, M, K) or (B, N, K).
// Ativations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K).
if (isWeights)
// Activations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K).
std::vector<uint64_t> shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens)};
if (useTmaOobOpt /* also implies input/output activation */)
{
shape.push_back(static_cast<uint64_t>(options.mNumBatches));
// If TMA OOB optimization is used, we use 3D logical shape (M, tileM, K) or (N, tileN, K).
// The outer dimension is extended to make room for the possible counterbalance positive
// offset from the middle "bound" dimension. The counterbalance should be no more than
// ctaTileNumTokens.
shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(ctaTileNumTokens),
static_cast<uint64_t>(numTokens + ctaTileNumTokens)};
}
else if (isWeights)
{
// If the matrix is a weights matrix, we use 3D logical shape (B, M, K) or (B, N, K).
shape = {static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(numTokens),
static_cast<uint64_t>(options.mNumBatches)};
}
// Assemble the stride (strideTokens, 1).
// Swap the first two dimensions as mentioned before.
auto stride = std::vector<uint64_t>{1, static_cast<uint64_t>(hiddenSize)};
if (isWeights)
std::vector<uint64_t> stride = {1, static_cast<uint64_t>(hiddenSize)};
if (useTmaOobOpt)
{
stride.push_back(static_cast<uint64_t>(hiddenSize * numTokens));
stride = {1, static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(hiddenSize)};
}
else if (isWeights)
{
stride = {
1, static_cast<uint64_t>(hiddenSize), static_cast<uint64_t>(hiddenSize) * static_cast<uint64_t>(numTokens)};
}
// Assemble the box shape
std::vector<int32_t> tileShape = {tileHiddenSize, tileNumTokens};
// Alternate layouts do not apply to matrixC
// Alternate layouts (MajorMn and BlockMajorK) do not apply to matrixC
if (matrixType != MatrixType::MatrixC)
{
gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB;
@ -157,7 +209,7 @@ static auto makeTmaShapeStrideAbc(
// Create the TMA shape/stride for A/B block scaling factors.
static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK,
tg::Dtype dtypeElt, tg::SfLayout layout, int sfReshapeFactor)
tg::SfLayout layout, int sfReshapeFactor, const int32_t numEltsPerSf)
{
// The outer dimension.
@ -168,8 +220,6 @@ static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType
auto numTokensPerTile = matrixType == MatrixType::MatrixA ? tileM : tileN;
// The inner tile dimension.
auto hiddenSizePerTile = tileK;
// Number of elements per scaling factor.
const int32_t numEltsPerSf = (dtypeElt == tg::Dtype::E2m1) ? 16 : 32;
switch (layout)
{
@ -264,7 +314,7 @@ template <class GemmOptions_>
static KernelParams setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, void const* ptrB,
void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, void const* ptrPerTokenSfB,
void const* ptrBias, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, float const* ptrClampLimit,
float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, int32_t const* routeMap, float* rowMax,
float const* ptrGatedActAlpha, float const* ptrGatedActBeta, int32_t const* routeMap, float* rowMax,
uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr,
int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr,
int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = KernelParams::MaxNumCtas)
@ -281,8 +331,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
params.ptrScaleC = ptrScaleC;
params.ptrScaleGate = ptrScaleGate;
params.ptrClampLimit = ptrClampLimit;
params.ptrSwiGluAlpha = ptrSwiGluAlpha;
params.ptrSwiGluBeta = ptrSwiGluBeta;
params.ptrGatedActAlpha = ptrGatedActAlpha;
params.ptrGatedActBeta = ptrGatedActBeta;
int32_t ctaOffset = 0;
@ -296,8 +346,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
for (int b = 0; b < options.mNumBatches; b++)
{
int mM = batchM ? options.mBatchedM[b] : options.mN;
int mN = batchM ? options.mM : options.mBatchedN[b];
int mM = batchM ? options.mBatchedM[b] : options.mM;
int mN = batchM ? options.mN : options.mBatchedN[b];
// Skip Tma descriptor creation if expert isn't used
if (mM == 0 || mN == 0)
@ -394,9 +444,10 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0;
// Build TMA descriptor for gmem A block scaling factors.
auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches,
options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK,
options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor);
auto [shapeSfA, strideSfA, tileShapesSfA]
= makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, options.mN, options.mK, MatrixType::MatrixA,
options.mTileM, options.mTileN, options.mTileK, tg::SfLayout::R128c4, options.mSfReshapeFactor,
options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA)));
params.tmaSfA[0]
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast<void*>(dSfA));
}
@ -436,8 +487,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
// Build TMA descriptor for gmem B block scaling factors.
auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, inputNumTokensSfB,
options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, options.mDtypeB,
options.mSfLayoutB, options.mSfReshapeFactor);
options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, options.mSfLayoutB,
options.mSfReshapeFactor, tg::dtypeNumEltsPerSf(options.mDtypeB));
params.tmaSfB[0]
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast<void*>(dSfB));
}
@ -501,9 +552,10 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
auto const inputNumTokensSfA = ctaOffset * options.mTileM;
// Build TMA descriptor for gmem A block scaling factors.
auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN,
options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, options.mDtypeA,
tg::SfLayout::R128c4, options.mSfReshapeFactor);
auto [shapeSfA, strideSfA, tileShapesSfA]
= makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, options.mK, MatrixType::MatrixA,
options.mTileM, options.mTileN, options.mTileK, tg::SfLayout::R128c4, options.mSfReshapeFactor,
options.mSfBlockSizeA.value_or(tg::dtypeNumEltsPerSf(options.mDtypeA)));
params.tmaSfA[0]
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast<void*>(dSfA));
}
@ -517,7 +569,7 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
// Build TMA descriptor for gmem B block scaling factors.
auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM,
options.mN * options.mNumBatches, options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN,
options.mTileK, options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor);
options.mTileK, options.mSfLayoutB, options.mSfReshapeFactor, tg::dtypeNumEltsPerSf(options.mDtypeB));
params.tmaSfB[0]
= gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast<void*>(dSfB));
}
@ -562,4 +614,5 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace batchedGemm
} // namespace batchedGemm

View File

@ -1,4 +1,3 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
@ -19,6 +18,7 @@
namespace batchedGemm
{
// This is device code
struct KernelParams
@ -29,9 +29,58 @@ struct KernelParams
//
//////////////////////////////////////////////////////////////////////////////////////////////////
// Maximum number of CTAs
// Maximum number of CTAs in the batch-token dimension.
static constexpr int MaxNumCtas = 2048;
// NOTE: TMA out-of-bounds optimization for MoE padded tokens:
//
// Originally the padded tokens form a 2D tensor [hiddenDim, ctaGridDimY * tileN] with stride [1,
// hiddenDim] and box size [tileM, tileN] at pointer p. We waste bandwidth since we only
// want to load [0, batchEnd) out of the [0, tileN) box: batchEnd is a runtime variable while
// the box size needs to be fixed at compile time.
//
// To deal with this, we reshape the tensor to 3D: [hiddenDim, tileN, ctaGridDimY * tileN] with
// stride [1, hiddenDim, hiddenDim] and box size [tileM, tileN, 1]. For the original 2D
// tensor,
//
// Offset Coords [ : , ctaIdxY * tileN ],
// Box Sizes [ : , tileN ],
// Coords Range [ : , ctaIdxY * tileN : ctaIdxY * tileN + tileN],
//
// while we only want to load the range [ctaIdxY * tileN, ctaIdxY * tileN + batchEnd), 1 <= batchEnd
// <= tileN.
//
// For the reshaped 3D tensor,
//
// Offset Coords [ : , tileN - batchEnd ,
// ctaIdxY * tileN + batchEnd ],
// Box Sizes [ : , tileN ,
// 1 ],
// Coords Range [ : , tileN - batchEnd : min(tileN, 2 * tileN - batchEnd),
// ctaIdxY * tileN + batchEnd : ctaIdx * tileN + batchEnd + 1],
//
// while min(tileN, 2 * tileN - batchEnd) always evaluates to tileN. The unwanted tokens are
// essentially filtered out by utilizing the OOB feature of TMA. Since the 2nd and 3rd dimensions
// have the same stride, we end up loading the following (adding the left and right ends of the 2nd
// and 3rd dimension ranges):
//
// Effective 2D Coords Range
// [ : , tileN + ctaIdxY * tileN : tileN + ctaIdxY * tileN + batchEnd],
//
// This is exactly the same as the original range except for the offset tileN, thus we also need
// to offset the pointer in the opposite direction:
//
// Ptr (p) -> Ptr (p - tileN * hiddenDim)
//
// Due to the restrictions of the TMA unit, the above operations require the TMA descriptor and the
// underlying buffer to be constructed differently:
// - Requires a valid buffer at (p - tileN * hiddenDim) - needs prepending `tileN` tokens.
// - The TMA outermost dimension must be extended by `tileN` or loads will go OOB on the rightmost side.
// The latter is because when batchEnd == tileN, the offset coordinate in the 3rd dimension becomes
// ctaIdxY * tileN + tileN. When ctaIdxY = ctaGridDimY - 1, it becomes (ctaGridDimY - 1) * tileN
// + tileN = ctaGridDimY * tileN, which is equal to the 3rd dimension size and will be filtered
// out. That's why we need to extend the tensor size by tileN.
//
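A worked numeric instance of the reshaping described above, with made-up sizes (tileN = 128, ctaGridDimY = 4), checking that the 3D offset trick loads exactly the first batchEnd tokens of the CTA's tile:

```cpp
#include <cassert>
#include <cstdio>

int main()
{
    // Made-up sizes for illustration only.
    int const tileN = 128;
    int const ctaGridDimY = 4;
    int const ctaIdxY = 2;
    int const batchEnd = 80;   // runtime-valid tokens in this CTA's tile, 1 <= batchEnd <= tileN

    // 3D offset coordinates from the note above: [:, tileN - batchEnd, ctaIdxY * tileN + batchEnd],
    // with the base pointer shifted back by tileN tokens.
    int const dim1Begin = tileN - batchEnd;
    int const dim2Coord = ctaIdxY * tileN + batchEnd;

    // Walk the tileN-wide box in dim1; TMA drops the out-of-bounds part (dim1 >= tileN).
    int firstToken = -1, lastToken = -1, count = 0;
    for (int j = dim1Begin; j < dim1Begin + tileN && j < tileN; ++j)
    {
        // dim1 and dim2 share the same stride (hiddenDim), and the pointer shift contributes -tileN.
        int const token = j + dim2Coord - tileN;
        if (firstToken < 0) firstToken = token;
        lastToken = token;
        ++count;
    }

    // Exactly the wanted range [ctaIdxY * tileN, ctaIdxY * tileN + batchEnd).
    assert(firstToken == ctaIdxY * tileN && lastToken == ctaIdxY * tileN + batchEnd - 1 && count == batchEnd);
    // The dim2 coordinate can reach ctaGridDimY * tileN, hence the tensor is extended by tileN.
    assert(dim2Coord <= ctaGridDimY * tileN);
    std::printf("tokens [%d, %d), count %d\n", firstToken, lastToken + 1, count);
    return 0;
}
```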
// TMA descriptor for A.
// Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from
// makeTmaShapeStrideAbc.
@ -211,15 +260,15 @@ struct KernelParams
// x_linear = x_linear.clamp(min=-limit, max=limit)
float const* ptrClampLimit{nullptr};
// The alpha and beta for SwiGlu.
// The alpha and beta for SwiGlu or GeGlu.
// Shape is [B]. One alpha and one beta per tensor in batch.
// Alpha is 1.f if nullptr.
// Beta is 0.f if nullptr.
// The formula:
// The formula for SwiGlu (for GeGlu, replace sigmoid with phi):
//
// out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
float const* ptrSwiGluAlpha{nullptr};
float const* ptrSwiGluBeta{nullptr};
float const* ptrGatedActAlpha{nullptr};
float const* ptrGatedActBeta{nullptr};
// The K dimension. It is the hidden dimension of the input matrices.
int32_t k;

View File

@ -19,7 +19,9 @@
#include "Enums.h"
#include "trtllm/gen/CommonUtils.h"
#include "trtllm/gen/DtypeDecl.h"
#include "trtllm/gen/MmaDecl.h"
#include <cassert>
#include <stdexcept>
namespace batchedGemm
{
@ -77,18 +79,29 @@ public:
}
// Returns the offset of the chunk with the given name.
int32_t getChunkOffset(int32_t ii) const
int32_t getChunkOffsetByName(std::string const& name) const
{
if (mFirstChunkReuse[ii])
for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii)
{
// Reuse the offset of the 0th chunk.
return getChunkOffset(0);
if (mSmemChunkNames[ii] == name)
{
return getChunkOffset(ii);
}
}
throw std::runtime_error("Name not found: " + name);
}
// Get offset of ii chunks.
auto offset = getOffsetBeforeChunk(ii);
// Ensure alignment for the current chunk
return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second);
// Returns the first chunk reuse flag given chunk name.
int getFirstChunkReuseFlagByName(std::string const& name) const
{
for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii)
{
if (mSmemChunkNames[ii] == name)
{
return getFirstChunkReuseFlag(ii);
}
}
throw std::runtime_error("Name not found: " + name);
}
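A toy standalone model of the name-based lookup above, showing how replacing hard-coded indices with chunk names keeps the getters valid if the chunk order changes (the chunk names and offsets below are illustrative; the real helper computes offsets from sizes and alignments):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Toy stand-in for the allocator helper: chunks identified by name instead of index.
struct NamedChunks
{
    std::vector<std::string> names;
    std::vector<int32_t> offsets;

    int32_t getChunkOffsetByName(std::string const& name) const
    {
        for (size_t ii = 0; ii < names.size(); ++ii)
        {
            if (names[ii] == name)
            {
                return offsets[ii];
            }
        }
        throw std::runtime_error("Name not found: " + name);
    }
};

int main()
{
    NamedChunks chunks{{"smemLoadA", "smemLoadB", "smemGmemC0"}, {0, 4096, 8192}};
    return chunks.getChunkOffsetByName("smemLoadB") == 4096 ? 0 : 1;
}
```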
// Function to calculate the total size of the SMEM array
@ -97,12 +110,6 @@ public:
return getOffsetBeforeChunk(static_cast<int32_t>(mNumBytesAndAlignmentPerSmemChunk.size()));
}
// Returns the first chunk reuse flag for the ith chunk.
int getFirstChunkReuseFlag(int32_t ii) const
{
return mFirstChunkReuse[ii];
}
// Print the contents of this object.
void print() const
{
@ -115,6 +122,26 @@ public:
}
private:
int32_t getChunkOffset(int32_t ii) const
{
if (mFirstChunkReuse[ii])
{
// Reuse the offset of the 0th chunk.
return getChunkOffset(0);
}
// Get offset of ii chunks.
auto offset = getOffsetBeforeChunk(ii);
// Ensure alignment for the current chunk
return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second);
}
// Returns the first chunk reuse flag for the ith chunk.
int getFirstChunkReuseFlag(int32_t ii) const
{
return mFirstChunkReuse[ii];
}
// Helper function to calculate padded size
int32_t getSizePaddedToAlignment(int32_t size, int32_t alignment) const
{
@ -139,9 +166,7 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind)
{
if (mmaKind == tg::MmaKind::Auto)
{
std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl;
assert(false);
return -1;
throw std::runtime_error("mmaKind != tg::MmaKind::Auto");
}
if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4)
{
@ -163,11 +188,11 @@ public:
// The constructor.
KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA,
tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM,
int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK,
int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput,
AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA,
bool usePerTokenSfB, BiasType biasType)
tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t mmaK, int32_t tileM, int32_t tileN, int32_t tileK,
int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma,
int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore,
bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8,
bool usePerTokenSfA, bool usePerTokenSfB, BiasType biasType)
: mMmaKind{mmaKind}
{
//
@ -470,8 +495,8 @@ public:
bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA);
// Number of columns for scaling factors of A.
auto const numTmemColsSfA = useConstSfA
? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4)
: (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0);
? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK), 4)
: (useBlockScalingA ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK)) * numStages : 0);
// Number of columns for Sf alignment.
auto const numColsAlignmentSfA = 4;
// No need to reuse TMEM.
@ -491,8 +516,8 @@ public:
bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB);
// Number of columns for scaling factors of B.
auto const numTmemColsSfB = useConstSfB
? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4)
: (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0);
? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK), 4)
: (useBlockScalingB ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK)) * numStages : 0);
// Number of columns for Sf alignment.
auto const numColsAlignmentSfB = 4;
// No need to reuse TMEM.
@ -541,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits)
inline int32_t getSmemOffsetLoadA(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(0);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetLoadB(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(1);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -562,64 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits)
inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(2);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0)
{
return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetRowMax(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(5);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetSliceK(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(6);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(7);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetBias(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(8);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetBlockAmax(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(9);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits)
{
return traits.mSmemAllocatorHelper.getChunkOffset(10);
return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0)
{
// Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC().
return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx);
return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx));
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -630,28 +654,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0)
inline int32_t getTmemOffsetD(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(0);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getTmemOffsetA(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(1);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getTmemOffsetSfA(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(2);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA");
}
////////////////////////////////////////////////////////////////////////////////////////////////////
inline int32_t getTmemOffsetSfB(KernelTraits traits)
{
return traits.mTmemAllocatorHelper.getChunkOffset(3);
return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB");
}
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -181,6 +181,8 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st
if (result != CUDA_SUCCESS)
{
char const* errorString;
cuGetErrorString(result, &errorString);
std::stringstream ss;
ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl;
@ -283,8 +285,10 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector<uint64_t> c
if (result != CUDA_SUCCESS)
{
char const* errorString;
cuGetErrorString(result, &errorString);
std::stringstream ss;
ss << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl;
ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl;
ss << "tmaFormat: " << static_cast<int>(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl;

View File

@ -213,7 +213,7 @@
"useCudaGraph": true,
"biasType": "m",
"act": "swiglu",
"patchF2fp": true,
"patchF2fp": false,
"clampLimit": 2
}
},

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a00994f28fd8a090e81b27d5fccd661e7dbeb3638d57bb0b40d906116046a1d8
size 687798

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:579a6db1db6d9015a5460c4b169be47768547b9ebddf29e990978480ca340e21
size 564401

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f47b426fc1e92ed88dd23f54bc06ec4384857c94c6ab5779f9cb0fa124977e60
size 708572

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f63598e080f81bd5524c49e0b44c1c54e64937e55b1aedfe2fb462843012367c
size 584335

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:267dc78734e8c78037907a551db531cf85f96910f3370fb066f35815953e1166
size 671864

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:941ab2e3f61f132637289dc97f5565f9d6d0940d572a04124a3449629a2900dc
size 551623

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33870b92c93116930164d2bc211580dda35b9334c0e0ac4672a189c0469ea6bc
size 704674

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0f4ff03ea3bb558d74d4b8235342c4ae39bd763df713a6f49e3861297272e5e2
size 577133

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f154e9d9d71264e281301773fc589bde477959bbae749192132ca4334f4166d9
size 728992

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3cde45cc498bc24b416c9f20a183f1ecf93296e5c22319b85abe1a85ab6c57cc
size 567953

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fc4093a459e21201290cef1c1217fa33899088af90286b4f30c3b194dab3346
size 748976

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:162dc89ea8cbf624f564e65f2c7aaa1ba595f9aa26629be803462fbd45432573
size 587937

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:60d640435b43f2da73acd9af496cd5f367257aa21a84b20fbda805d0f36dd778
size 674626

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b1f3ed94a446bca3ad59721dfcce8d48d14df5d8be98a4e47739df22ef9e59b
size 553843

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b67189fb9badfaa46164e92d93b7d6ff5710f7759f3149253e3b4f84432d76db
size 706944

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df1a2b41a4ccfe8794cff472bbceb8fc1501f1fecfd108f2b3ccb9a1d152cc32
size 579353

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86a7589d547ddca569573d7f9aa9b586c7e5310598a4f036cdeab7c47c45fcac
size 703782

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1534e134c3ee3313e797f04debd27fd2edf24ac2b4ad35d0936b3db617cc4e15
size 578905

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:749eaedcf2c05503d82ddac16b9e32a4e19df0bce2ec3a52387a6327149fa493
size 723618

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:383a09dc39a680812055757107892846735fa68ba5d655ef35df3d6294a9067c
size 599677

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:27ef0c4f7bd2372e78cd85c43b3885505c0faeb32f9acbb4f0f59885f07a74db
size 684148

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af2a95eee8c93e6e858ccc70e48e2460c0224e60e085bc403bfcb170804e757b
size 550637

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a521e3ca7395b7b0de604d199e41aa0f6c897085602bab99e5e5966b1865f1c
size 716464

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a27faab417980175bedb8032bef16cd7be7b7a334712fa6852ddb7c52ad18e3
size 576145

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c775c157681ca26635ceaffe084a1ed11952f08021137ce0e0206e25646ce878
size 683894

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3313eda5144a86a3009d4d80bcfbc0da796e14a6fed5597c29cc782edbf81faa
size 559905

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7f6a8a000d3d91ac5ea0e0484dfcddae0834a8bffe5495c03ce5ba5db41044b
size 703928

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c8c68901f54f93e836cb3fb4f2d1b553e4d0d88ed943d9fc5f8fbf3fd68f586
size 580679

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc6fe5e9c7016f3807ef9062e190e1183e1bf9bcdcc61662c330c6aeef6c971d
size 666874

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:14c285449fb66d2d4f570884e34d3c157a4dc3e31e54732b9a08eb790cb1e521
size 547967

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1eb6fcd0b60b9d627a9db0abbbbf5c9ab75a7efb433bbb4b2996341c1949f155
size 544199

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:55fbc0ae83071cf4538507ac0c328c278ef645e4ae76dc579742d2c8c8c2c483
size 699192

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:38d80e09ae6a26d2f8e62186cfc9cdb8ebf2535d18f99fa6b0548a9c2dee0f2c
size 572687

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:245acbd9299bcc4637ca80a6f211c63cb2e2df61c4ad88de962038f8b75ce35b
size 568919

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7dbd4719b557daa126dbdc19d8561a8a30f164d99353bf2df6a86fd4a7876fd3
size 625070

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb8f90a52a64101a20215d80f5314a2567e12bbde2d1f07172a63fd45cf57717
size 499157

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73acfe8ba041176e5ff9b070b37d35009555e63c741a7f6df7c4ba12ddc9743e
size 651468

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99b66a013a7c3177672361397e6aca91ab2c7f40c0bfd9708740de6055ebc428
size 518055

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70ffb474cb37a69b71d3d4fa4798f396a62c3f93d096088ad89f4977221cdb3e
size 632568

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3356aac38b9cbd95278d93033c83cd7396eab034c3e055a0a037d02ebb4c2a8
size 509269

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6176eca42742e3dda6a850ff5616b89b00690fdca7a9350c98a7e0e68623c0cc
size 658078

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dde09ccde1787cadb4187cf1fda955cf6ebac98a3e2255dbe4f57302f88267a0
size 525505

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:869a4cbbf5767cd6fa40df88b35a67709cc3540746c4a8ed001a92d7ae2b0065
size 670456

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cab9ea767f345e1a458e88e1931070ac0a2fa85a34b49272605a735c35ccff8f
size 545185

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28344c0d4201c752444ff29d76941fca73f926c6916753eccc98b14ee357ef5d
size 702972

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4c46395656283eddab5a4111e464197a0ffbdb1b4e5d5ec95402b236e130be8
size 562357

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fdab9ed0c9f48820444f4c4b83e8bc71a6bb1a7a575ee0fae6f37612bd001419
size 502487

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:333ffe4dff54e5fd2661594715845c21b38944e4e61462ceb17b7b4a9ca9f79f
size 618108

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:362ffe2a2c759083cfc76fe00f1c39a5e8c04db1dadbd065b044151c7f9e2a4f
size 528651

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a5493719a699547366f804f9d48aa1370c66a3f846346ae9cc477b9d4be1fefc
size 519709

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea401e13acccae0bb820f8609b397765024eae85eae64e079bb2a8036e5b2139
size 644012

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ce72bac60c42659e72216df441bc40a958e8ea2348893ed48bdf743cd2746c0
size 546811

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5fe3927fe0060a52d10c2c1d88df5f4a09a5279c7e04451d857fabaabcfc8435
size 422969

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75c96fd88f6b34de2b2841364ac82a661e14ddb7791d8f9d334024c22ffe48fa
size 353741

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fdfda724b352141be7abdf193ea436ed4ca849a5cd28973d8782ed912c5ade0
size 349677

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e05eee158202502eb5f0db236294c7d7536a2ecc4c5960cfe337191a2986e74
size 444385

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b88a4fabac9c6b4dc123c71460a7ec3dfc76b2c8e6c5657d3c90642a522ba44
size 376439

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd7082a9bbc9ae814c60a678f8f16f3c3904f3bcd4de57ddc556b6dd6e721c18
size 372425

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41e61d3b107feb291c6c48a743dd4564cd45a2c4d4250abe8f1cac61629da941
size 624208

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:811a3e628f108a339865472c185a9e7d81e7f6a5a15ff016864fb0589c9bfb7c
size 521135

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d52dc5c75133d00907ca1b4cd23c78dd4f2344bf53b379a86ec2394884901046
size 519141

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f96b13880ed8cbcb0ef52f130e88e6b5be98d02ebe6065193358b5c3397a04eb
size 644784

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bb9750f2d9777c3343b6cd115edab8eff1db02209c0e22305c81ca2589687fc
size 539195

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4deb54c0eb6e8b453c767ce1bf9b54d2cbe794145de626cae12b2497e00773b2
size 537399

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4bf181d0b77abd2a9a0ed3a6a442fb3dc857cb4c08d25809c7552c22d18b761
size 628500

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:713e4d0aaebf35baee311932b5d4209e38bbc048ad3a4e857deeed622568d35b
size 525427

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:32c4bd4ebac53d09339bdd53e9da05e91e193286d3b3ff43dc1467ccf2e06e03
size 542525

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb2e4b0c44e6564f61ec1431f0a8e539ba13fc223972875ae876ffe0a338a198
size 649026

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae853f01146089fdf753f7fd205efb056594c90338cfbeb2e7857160650f86a1
size 544227

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4c22dbdcbd1b30b867f24566cfc4fed5f2319ce5f94f4599d10b1c2175da6e5
size 560733

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2179108f87b5ed9bf0db89f10e6a430aec24447c6b0970d7203a17fbe3add022
size 636048

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20113c7a7f5035ec72d861321115163d3a98f56005cb3c2a90cfd036c83ba019
size 533765

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fe9eadc9b4258ba1028110434aeb3a177e42bcef4a243dcabb2bc5112ac23c8
size 588207

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f2fd20d74c9346c0e2f5fc623c1ebfce9c7843b137b23bc0d3b0ea1ead4fc500
size 657364

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b504b4cdb647b2776962e94d4cb29b588db389ec598dfa5976a26bf462aa783
size 551825

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f92c775204623972f3b0e1011841212e2dde72cfedaea191059f6b210a526e7
size 607205

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e3c788929d5f84ad15759fd38c410fc81d8ff47ad397fa3eeddde16f48ac0a7
size 621242

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcca2cd8c24765aa9d7c6613193c5860be0957d825450329e1b2c2d655b62555
size 634738

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06fde6f5bef399e127d1d193c55fde8426ec5c7e31a43b8b98dbfdc2f368cadd
size 518071

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c63310ec110aeeae459a0e0a5d31f2ae1debad18a951c564109966958e95688
size 530087

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:642e348a5d1460393d41d3f6e066d563bf793c9bc53e782b9bcbf53aef654cc8
size 641670

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d0daef205c07d0160f56756d4f3b3dfb2da133195ee83b64865ec6b343f2b5c
size 656252

Some files were not shown because too many files have changed in this diff.