Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00

Merge branch 'main' into user/yuhangh/support_export_data_in_eval

Commit 8d858f912e
@@ -648,7 +648,7 @@ public:

void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);

[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false);

void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
@@ -853,8 +853,8 @@ public:
//! \param blockKeys Key of each block.
//! \param blockIds Id of each block.
//! \param pinBlocks If true, increment ref count for blocks while storing (pin on store).
//! \return Pair of (num blocks stored for reuse, id of the last block stored if any).
[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
//! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
bool pinBlocks = false);
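With this change, storeBlocks no longer reports just the id of the last stored block; it returns every block id that was pinned. A minimal caller sketch under the new signature (the manager instance and surrounding flow are hypothetical; only the two member functions shown are from this diff):

// Hypothetical usage sketch: pin on store, unpin later via the vector overload.
auto const [numStored, pinnedBlockIds] = manager.storeBlocks(blockKeys, blockIds, /*pinBlocks=*/true);
// ... pinned blocks keep an elevated ref count and cannot be evicted ...
manager.unpinBlocksById(pinnedBlockIds);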
@@ -886,8 +886,8 @@ public:

[[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);

//! \brief Unpin blocks by starting from a block id and walking prev pointers.
void unpinBlocksById(KVCacheBlock::IdType blockId);
//! \brief Unpin blocks by block ids directly
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);

void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
{
@@ -1103,7 +1103,7 @@ public:
std::optional<KVCacheBlock::IdType> releaseBlocks(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);

[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);

void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
@@ -1112,7 +1112,7 @@ public:
/// @param sequence The generation request whose blocks should be pinned.
void pinBlocks(GenerationRequest& sequence);

void unpinBlocksById(KVCacheBlock::IdType blockId);
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);

void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);

@@ -1133,7 +1133,7 @@ public:
void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");

[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
SizeType32 windowSize, bool pinBlocks = false)
{
@@ -1584,7 +1584,7 @@ public:
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;

/// \brief Store blocks for reuse for a given request id
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] virtual std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false)
= 0;
@@ -1678,7 +1678,7 @@ public:
BlockKey const& blockKey, SizeType32 windowSize)
= 0;

virtual void unpinBlocksById(KVCacheBlock::IdType blockId) = 0;
virtual void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) = 0;
};

class KVCacheManager : public BaseKVCacheManager
@@ -1939,7 +1939,7 @@ public:
//! \brief Store newest blocks for reuse
void storeNewBlock(LlmRequest const& llmRequest) override;

[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false) override;

[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
@@ -1960,7 +1960,7 @@ public:

void pinBlocks(LlmRequest::RequestIdType requestId) override;

void unpinBlocksById(KVCacheBlock::IdType blockId) override;
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) override;

std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;

@@ -1667,6 +1667,12 @@ public:
[](auto reason) { return reason == executor::FinishReason::kLENGTH; });
}

[[nodiscard]] bool isFinishedDueToCancellation() const noexcept
{
return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),
[](auto reason) { return reason == executor::FinishReason::kCANCELLED; });
}

[[nodiscard]] bool isTimedOut() const
{
if (!mAllottedTimeMs.has_value())
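A detail worth noting about the new predicate: std::all_of returns true on an empty range, so both isFinishedDueToLength and isFinishedDueToCancellation would report true when mFinishReasons is empty; callers presumably only query these once beams exist (an observation about the standard algorithm, not text from the commit).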
@@ -129,6 +129,18 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena
#define SLIDING_WINDOW 0
#endif

#ifndef SKIP_SOFTMAX_ATTN
#define SKIP_SOFTMAX_ATTN 0
#endif

#ifndef SKIP_SOFTMAX_ATTN_BLOCK_STATS
#define SKIP_SOFTMAX_ATTN_BLOCK_STATS 0
#endif

#ifndef SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
#define SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE 1
#endif

// 0 - no PDL
// 1 - naive PDL
// 2 - aggressive PDL (implemented only in mha_sm90.cu for now)
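The threshold semantics these flags gate (see the kernel and host-reference changes below): the runtime scale factor is divided by the cache sequence length, and a KV tile is skipped when its local max is far enough below the running max. As a sketch, with running max m, tile max m_t, and threshold t = skipSoftmaxThresholdScaleFactor / seqLen, each element of the tile contributes at most exp(m_t - m) to the softmax numerator, so the skip condition used is

exp(m_t - m) < t, i.e. m_t - m < ln(t).

This restates the comparisons against log(skipSoftmaxThreshold) that appear in computeWarpGrpColMax_sync and in refFlashAttention below; it is an interpretation, not text from the commit.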
@@ -106,6 +106,7 @@ __device__ inline MatDesc makeMatDesc(void const* data, uint32_t dimKByteOffset,
asm volatile("trap;\n");
return 0;
}();
assert(__cvta_generic_to_shared(data) % baseAlign == 0);
uint32_t const baseOffset = ((patternAddr % baseAlign == 0) ? 0U : ((patternAddr >> 0x7) & 0x7));
return MatDesc{
/*addr=*/MatDesc::encode(__cvta_generic_to_shared(data)),
@@ -2734,6 +2734,25 @@ static constexpr auto kernel_mha = kernel_mha_impl;
#endif

#ifndef GENERATE_CUBIN
uint32_t computeNbSubSeqPerSeqMHA(cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen)
{
if (!allowMultiBlockMode)
{
return 1;
}
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
return std::min<uint32_t>(
std::max<uint32_t>(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x));
}

void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
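A quick worked example of the heuristic above (illustrative numbers, not from the commit): on a 132-SM device with batchSize = 4 and nbKHeads = 8, multiProcessorCount / (batchSize * nbKHeads) = 132 / 32 = 4, so up to 4 sub-sequences per sequence are used, further capped by divUp(maxSeqLen, ctaTile.x). Setting XQA_NB_SUB_SEQ to a positive integer bypasses the heuristic entirely, e.g.:

XQA_NB_SUB_SEQ=2 ./unitTests   # hypothetical invocation; any process calling launchMHA honors the override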
@@ -2771,6 +2790,13 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor, // for compatibility with mha_sm90.cu only
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, // for compatibility with mha_sm90.cu only
uint32_t* __restrict__ totalBlockCount, // for compatibility with mha_sm90.cu only
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream)
{
@@ -2793,24 +2819,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
uint32_t const nbQHeads = nbKHeads * headGrpSize;

// const uint32_t nbSubSeqPerSeq = allowMultiBlockMode ? DBG_NB_CTAS_PER_SEQ : 1;
uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t
{
if (!allowMultiBlockMode)
{
return 1;
}
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
return std::min<uint32_t>(
std::max<uint32_t>(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x));
}();
uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqMHA(prop, batchSize, nbKHeads, maxSeqLen);
// gridDim.z == batchSize && gridDim.y == nbKHeads && gridDim.x == nbSubSeqPerSeq
#if SPEC_DEC
const uint32_t nbTokenBlocksPerGrp = divUp(qSeqLen * headGrpSize, rowsPerBlock);
@@ -90,6 +90,9 @@ struct BeamSearchParams
// match trt-llm API.
};

uint32_t computeNbSubSeqPerSeqMHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen);

void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@@ -127,9 +130,18 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream);

uint32_t computeNbSubSeqPerSeqHopperF8MHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen);

void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@@ -167,6 +179,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream);
@@ -49,6 +49,10 @@ static_assert(specDecQLen * headGrpSize <= 32, "SPEC_Q_SEQ_LEN macro value is to
#define SWAP_AB (!SPEC_DEC)
#endif

#if SKIP_SOFTMAX_ATTN
static_assert(SWAP_AB && USE_PAGED_KV_CACHE && !SPEC_DEC && BEAM_WIDTH == 1, "SKIP_SOFTMAX_ATTN is not supported.");
#endif

#define IS_SUPPORTED_F16_CASE (CACHE_ELEM_ENUM == 0 && !SPEC_DEC && SWAP_AB && !USE_INPUT_KV && !LOW_PREC_OUTPUT)

inline constexpr bool swapAB = SWAP_AB;
@@ -138,26 +142,38 @@ using PaddedOutHead = PaddedInputHead;

struct alignas(128) SharedMem
{
using QBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerQPart>, nbQParts>;
using KBuffer = Array2D<LdGrain, gemm0CtaTileNbTokens, exactDiv(cacheHeadPartBytes, grainBytes)>;
static constexpr uint32_t nbKBuf = 2;
KBuffer k[nbKBuf]; // as is loaded from global mem.
using XBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerXPart>, nbXParts>;
static constexpr uint32_t nbXBuf
= 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
using VBuffer = Vec<Array2D<LdGrain, gemm1CtaTileNbTokens, exactDiv(cacheHeadPartBytes, grainBytes),
sizeof(XBuffer) % (cacheHeadPartBytes * 8) == 0>,
cacheHeadNbParts>;
#if !SWAP_AB
using VTBuffer = Array2D<LdGrain, headElems, exactDiv(gemm1CtaTileNbTokens, cacheElemsPerGrain), true>;
#endif
static constexpr uint32_t nbVBuf = 2;
#if CACHE_ELEM_ENUM == 0
using OutSwizzleBuf = Array2D<LdGrain, ctaNbQHeads, grainsPerPaddedInputHead>;
#elif CACHE_ELEM_ENUM == 2
using OutSwizzleBuf = Array2D<Vec<Vec<InputElem, 4>, 4>, ctaNbQHeads, exactDiv(headElems, 4 * 4)>;
#endif

#if SKIP_SOFTMAX_ATTN
static constexpr uint32_t nbKBuf = 2;
static constexpr uint32_t nbVBuf = 3; // @fixme: skip_softmax_attn: for skip softmax attn, an extra VBuffer is used
static constexpr uint32_t nbXBuf
= 3 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
#else
static constexpr uint32_t nbKBuf = 2;
static constexpr uint32_t nbVBuf = 2;
static constexpr uint32_t nbXBuf
= 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
#endif
static_assert(nbXBuf == nbVBuf);

// note: buffers used for GMMA may have additional alignment requirements
KBuffer k[nbKBuf]; // as is loaded from global mem.
QBuffer q; // For gmma math. Conversion done if needed.

union ReusedXVOutSwizzleBuf
{
struct XV
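A note on the buffer counts above (an inference from the diff, not commit text): the extra X/V stage when SKIP_SOFTMAX_ATTN is enabled decouples the gemm0 vote from the V-load warp, since a skipped tile releases its buffers on a different path. Also, the new static_assert(nbXBuf == nbVBuf) can only hold when the multiplier in nbXBuf is 1, i.e. gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens, which is presumably the only configuration built with skipping enabled.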
@@ -196,9 +212,6 @@ struct alignas(128) SharedMem
return reusedXVOutSwizzleBuf[i].outSwizzle;
}

using QBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerQPart>, nbQParts>;
QBuffer q; // For gmma math. Conversion done if needed.

// @fixme: move these into reusedXVOutSwizzleBuf
#if SWAP_AB
ShmQWiseVec xColMax[nbXBuf];
@@ -220,6 +233,11 @@ struct alignas(128) SharedMem
Vec<KVCachePageIndex, nbPagesPerTile> pages[2]; // one for K and one for V
#endif

#if SKIP_SOFTMAX_ATTN
uint32_t skipSoftmaxVotesGemm0ToV[nbXBuf]; // guarded by skipSoftmaxXBar
uint32_t skipSoftmaxVotesGemm0ToGemm1[nbXBuf]; // guarded by xBar
#endif

// mem barriers

CtaBarrierPair qBar;
@@ -229,6 +247,9 @@
CtaBarrierPair vtBar[nbVBuf];
#endif
CtaBarrierPair xBar[nbXBuf];
#if SKIP_SOFTMAX_ATTN
CtaBarrierPair skipSoftmaxXBar[nbXBuf]; // for V to wait for X to be ready
#endif

// used internally in the gemm0 warp group
// @fixme: use separate arrive and wait for all usage
@@ -425,8 +446,13 @@ __device__ void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
#endif

#if SWAP_AB
#if SKIP_SOFTMAX_ATTN
__device__ RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src,
float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip);
#else
__device__ RegColWiseVec computeWarpGrpColMax_sync(
CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src);
#endif
__device__ void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32_t validRowBeg, uint32_t validRowEnd);
__device__ void warpGrpOnlineSoftmax(Gemm0Acc& acc, RegColWiseVec const& colMax);
__device__ RegColWiseVec computeWarpColSum(Gemm0Acc& src);
@@ -675,6 +701,12 @@ CUBIN_EXPORT __global__
#endif
#if SPEC_DEC
SpecDecParams const specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* __restrict__ const semaphores
= nullptr, // [nbReq][nbKHeads][divUp(specDecParams.qSeqLen, inputTokensPerCta)]
@@ -753,6 +785,10 @@ CUBIN_EXPORT __global__
uint32_t const nbSubSeq = isMultiBlockMode ? mha::min(nbTilesInUse / multiBlockMinNbTilesPerCta, maxNbSubSeq) : 1;
static_assert(multiBlockMinNbTiles >= multiBlockMinNbTilesPerCta * 2);
assert(isMultiBlockMode == (nbSubSeq > 1));
#if SKIP_SOFTMAX_ATTN
bool const disableSkipForShortSeq = (cacheSeqLen < skipSoftmaxThresholdScaleFactor);
float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / cacheSeqLen;
#endif
if (idxSubSeq >= nbSubSeq)
{
return;
@@ -776,21 +812,34 @@ CUBIN_EXPORT __global__
assert(dynamicSmemSize() >= sizeof(SharedMem));
SharedMem& smem = *reinterpret_cast<SharedMem*>(&smemByteBuf[0]);

constexpr uint32_t nbBuffers = 2;
static_assert(nbBuffers == SharedMem::nbKBuf && nbBuffers == SharedMem::nbVBuf && nbBuffers == SharedMem::nbXBuf);
if (wid < nbBuffers)
constexpr uint32_t maxNbBuffers = (SharedMem::nbXBuf > SharedMem::nbVBuf) ? SharedMem::nbXBuf : SharedMem::nbVBuf;
static_assert(
maxNbBuffers >= SharedMem::nbKBuf && maxNbBuffers >= SharedMem::nbVBuf && maxNbBuffers >= SharedMem::nbXBuf);
if (wid < maxNbBuffers)
{
if (warpElectSync())
{
smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size);
smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size);
#if !SWAP_AB
smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2);
if (wid < SharedMem::nbKBuf)
{
smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size);
}
if (wid < SharedMem::nbXBuf)
{
#if SKIP_SOFTMAX_ATTN
smem.skipSoftmaxXBar[wid].initialize(gemm0NbThrds + warp_size, gemm0NbThrds + warp_size);
smem.vBar[wid].initialize(gemm1NbThrds + warp_size, gemm1NbThrds + warp_size);
#else
smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size);
#endif
smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds);

#if !SWAP_AB
smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2);
#endif
smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds);
}
}
}
else if (wid == nbBuffers)
else if (wid == maxNbBuffers)
{
if (warpElectSync())
{
@@ -819,6 +868,10 @@ CUBIN_EXPORT __global__
SpecDec const specDec{specDecParams, idxReq, idxInputSubSeq, cacheSeqLen};
#endif

#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t localSkippedBlockCount = 0;
#endif

// QK gemm
constexpr uint32_t nbGmmaInstM = exactDiv(gemm0CtaTileNbTokens, gmma::instM);
using Acc = GmmaAcc<gemm0CtaTileNbTokens, ctaNbQHeads>;
@@ -940,10 +993,39 @@ CUBIN_EXPORT __global__
}
}
#endif

uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf;
auto& xBar = smem.xBar[idxXBuf];
// update colMax in shared mem and get a register copy
#if SWAP_AB
#if SKIP_SOFTMAX_ATTN
auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf];
skipSoftmaxXBar.consumed.arrive_and_wait();

bool const maybeSkip = !disableSkipForShortSeq && idxIter != 0;
RegColWiseVec const colMax = computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc,
skipSoftmaxThreshold, &smem.skipSoftmaxVotesGemm0ToV[idxXBuf], maybeSkip);
bool const shouldSkipSoftmaxAttn = static_cast<bool>(smem.skipSoftmaxVotesGemm0ToV[idxXBuf]);
unused(skipSoftmaxXBar.produced.arrive());
warpGrpOnlineSoftmax(acc, colMax);
if (shouldSkipSoftmaxAttn)
{
xBar.consumed.arrive_and_wait();
if (threadIdx.x == 0)
{
smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 1U;
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
localSkippedBlockCount++;
#endif
}
asm volatile("fence.proxy.async.shared::cta;\n"); // maybe not used
unused(xBar.produced.arrive());
continue;
}
#else
RegColWiseVec const colMax = computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc);
warpGrpOnlineSoftmax(acc, colMax);
#endif
#else
RegRowWiseVec const rowMax = computeWarpGrpRowMax_sync(warpRank, smem.gemm0CurrentSeqMax, acc);
warpGrpOnlineSoftmax(acc, rowMax);
@@ -959,8 +1041,6 @@ CUBIN_EXPORT __global__
// map 1 to fp8_max before conversion to fp8
acc = acc * kE4M3_MAX;

uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf;
auto& xBar = smem.xBar[idxXBuf];
// @fixme: for fp16/bf16, try not to transpose acc here, and leave it to the next GEMM.
#if SWAP_AB
storeGemm0AccToShm(warpRank, laneId(), smem.xBuf(idxXBuf), xBar.consumed, acc);
@@ -989,13 +1069,25 @@ CUBIN_EXPORT __global__
storeShmRowWiseVec(warpRank, smem.xRowMax[idxXBuf], rowMax);
storeShmRowWiseVec(warpRank, smem.xRowSum[idxXBuf], rowSum);
#endif

#if SKIP_SOFTMAX_ATTN
if (threadIdx.x == 0)
{
smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 0;
}
#endif
__syncwarp();
// the release semantics of arrive does not work for async consumers like gmma. additional fence is
// needed.
asm volatile("fence.proxy.async.shared::cta;\n");
unused(xBar.produced.arrive());
}
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
if (threadIdx.x == 0 && skippedBlockCount != nullptr && totalBlockCount != nullptr)
{
atomicAdd(skippedBlockCount, localSkippedBlockCount);
atomicAdd(totalBlockCount, nbIters);
}
#endif
unused(smem.qBar.consumed.arrive());
}
else if (warpIdx.z == 1)
@@ -1043,216 +1135,231 @@ CUBIN_EXPORT __global__
uint32_t idxVTile = idxVTileInit + idxIter * nbSubSeq;
auto const idxVBuf = idxIter % SharedMem::nbVBuf;
auto const idxXBuf = idxVBuf;
auto& vBar = smem.vBar[idxVBuf];
arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds));
auto const& vBuf = smem.vBuf(idxVBuf);
#if !SWAP_AB
CtaBarrierPair& vtBar = smem.vtBar[idxVBuf];
auto& vtBuf = smem.vtBuf(idxVBuf);
vtBar.consumed.arrive_and_wait();
transposeVTile(warpRank, laneId(), vtBuf, vBuf);
vBar.consumed.arrive();
vtBar.produced.arrive();
#endif
auto& xBar = smem.xBar[idxXBuf];
auto& vBar = smem.vBar[idxVBuf];
auto const& vBuf = smem.vBuf(idxVBuf);
xBar.produced.arrive_and_wait();
#if SKIP_SOFTMAX_ATTN
bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf]; // guarded by xBar
if (shouldSkipSoftmaxAttn)
{
vBar.produced.arrive_and_wait();
}
#endif

#if SKIP_SOFTMAX_ATTN
if (!shouldSkipSoftmaxAttn) // skip XVGemm
#endif
{
arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds));
#if !SWAP_AB
CtaBarrierPair& vtBar = smem.vtBar[idxVBuf];
auto& vtBuf = smem.vtBuf(idxVBuf);
vtBar.consumed.arrive_and_wait();
transposeVTile(warpRank, laneId(), vtBuf, vBuf);
vBar.consumed.arrive();
vtBar.produced.arrive();
#endif
#if !defined(NDEBUG) && DBG_PRINT
#if SWAP_AB
if (threadIdx.x == 0)
{
printf("colMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColMax[idxXBuf][i]);
}
printf("\n");
printf("colSum:\n");
for (int n = 0; n < 4; n++)
if (threadIdx.x == 0)
{
printf("colMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColSum[idxXBuf][n][i]);
printf("%f, ", smem.xColMax[idxXBuf][i]);
}
printf("\n");
printf("colSum:\n");
for (int n = 0; n < 4; n++)
{
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColSum[idxXBuf][n][i]);
}
printf("\n");
}
printf("\n");
printf("X:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
for (int j = 0; j < gemm0CtaTileNbTokens; j++)
{
auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart);
auto const e = reinterpret_cast<Vec<__nv_fp8_e4m3, 16>&>(
smem.xBuf(idxXBuf)[j / elemsPerXPart].template at<true>(
i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain];
printf("%.2f, ", float(e));
if (j % 16 == 15)
{
printf("| ");
}
}
printf("\n\n");
}
}
smem.gemm1WarpGrpBar.arrive_and_wait();
#else
if (blockIdx.y == 1 && threadIdx.x == 0)
{
printf("rowMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowMax[idxXBuf][i]);
}
printf("\n");
printf("rowSum:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowSum[idxXBuf][i]);
}
printf("\n");
}
printf("\n");
printf("X:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
for (int j = 0; j < gemm0CtaTileNbTokens; j++)
{
auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart);
auto const e = reinterpret_cast<Vec<__nv_fp8_e4m3, 16>&>(
smem.xBuf(idxXBuf)[j / elemsPerXPart].template at<true>(
i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain];
printf("%.2f, ", float(e));
if (j % 16 == 15)
{
printf("| ");
}
}
printf("\n\n");
}
}
smem.gemm1WarpGrpBar.arrive_and_wait();
#else
if (blockIdx.y == 1 && threadIdx.x == 0)
{
printf("rowMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowMax[idxXBuf][i]);
}
printf("\n");
printf("rowSum:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowSum[idxXBuf][i]);
}
printf("\n");
}
smem.gemm1WarpGrpBar.arrive_and_wait();
smem.gemm1WarpGrpBar.arrive_and_wait();
#endif
#endif

#if SWAP_AB
// @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead.
rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar);
// @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead.
rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar);
#else
rescaleGemm1AccForNewRowMax_sync(
warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf], smem.gemm1AccColMax, acc, smem.gemm1AccColSum);
rescaleGemm1AccForNewRowMax_sync(warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum);
#endif
auto& xBuf = smem.xBuf(idxXBuf);
auto& xBuf = smem.xBuf(idxXBuf);

auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::XBuffer::Elem{}))
.raw();
auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::XBuffer::Elem{}))
.raw();
#if CACHE_ELEM_ENUM == 0
auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::VBuffer::Elem{}))
.raw();
auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::VBuffer::Elem{}))
.raw();
#endif
#if SWAP_AB
//@fixme: to reduce code size, we can disable unroll and use double-buffer for LDSM in loadVTileTransposed.
#pragma unroll
for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++)
{
for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++)
{
#if CACHE_ELEM_ENUM == 2
Vec<RegMatAFrag, gemm1NbGmmaInstM> const fragA
= loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK);
Vec<RegMatAFrag, gemm1NbGmmaInstM> const fragA
= loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK);
#if !defined(NDEBUG) && DBG_PRINT
if (threadIdx.x == 0)
{
printf("fragA:\nidxInstK == %u\n", idxInstK);
}
smem.gemm1WarpGrpBar.arrive_and_wait();
for (int m = 0; m < 2; m++)
{
for (int w = 0; w < 4; w++)
if (threadIdx.x == 0)
{
if (warpRank == w)
printf("fragA:\nidxInstK == %u\n", idxInstK);
}
smem.gemm1WarpGrpBar.arrive_and_wait();
for (int m = 0; m < 2; m++)
{
for (int w = 0; w < 4; w++)
{
if (laneId() == 0)
if (warpRank == w)
{
printf(" warpRank = %u\n", warpRank);
}
__syncwarp();
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 8; b++)
if (laneId() == 0)
{
for (int c = 0; c < 2; c++)
printf(" warpRank = %u\n", warpRank);
}
__syncwarp();
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 8; b++)
{
for (int d = 0; d < 4; d++)
for (int c = 0; c < 2; c++)
{
if (laneId() == b * 4 + d)
for (int d = 0; d < 4; d++)
{
for (int e = 0; e < 4; e++)
if (laneId() == b * 4 + d)
{
auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>(
fragA[m](0, c)(a, 0));
printf("%.2f, ", float(elem4[e]));
for (int e = 0; e < 4; e++)
{
auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>(
fragA[m](0, c)(a, 0));
printf("%.2f, ", float(elem4[e]));
}
}
__syncwarp();
}
__syncwarp();
}
if (laneId() == 0)
{
printf("\n");
}
__syncwarp();
}
if (laneId() == 0)
if (laneId() == 0 && a == 0)
{
printf("\n");
printf("----------------------\n");
}
__syncwarp();
}
if (laneId() == 0 && a == 0)
{
printf("----------------------\n");
}
__syncwarp();
}
smem.gemm1WarpGrpBar.arrive_and_wait();
}
smem.gemm1WarpGrpBar.arrive_and_wait();
}
}
#endif
#endif
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * idxInstK};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
0, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * idxInstK};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
0, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
#if CACHE_ELEM_ENUM == 2
gmma::fence();
gmma::fence();
#endif
#pragma unroll
for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++)
{
for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++)
{
#if CACHE_ELEM_ENUM == 0
auto const descV
= addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0));
gmma::mma_async_shmA<MathElem, ctaNbQHeads, true, false>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
descV, descX, true);
auto const descV
= addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0));
gmma::mma_async_shmA<MathElem, ctaNbQHeads, true, false>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
descV, descX, true);
#elif CACHE_ELEM_ENUM == 2
gmma::mma_async_regA<MathElem, ctaNbQHeads>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
reinterpret_cast<uint32_t const(&)[2][2][1]>(fragA[idxInstM]), descX, true);
gmma::mma_async_regA<MathElem, ctaNbQHeads>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
reinterpret_cast<uint32_t const(&)[2][2][1]>(fragA[idxInstM]), descX, true);
#endif
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
}
#else
auto const descVTBase = gmma::makeMatDesc(
nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode<true>(SharedMem::VTBuffer{}))
.raw();
vtBar.produced.arrive_and_wait();
auto const descVTBase = gmma::makeMatDesc(
nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode<true>(SharedMem::VTBuffer{}))
.raw();
vtBar.produced.arrive_and_wait();
// if (idxIter == 1 && threadIdx.x == 0) {
// printf("vtBuf:\n");
// dbg::printArray2D<__nv_fp8_e4m3, true>(vtBuf);
// }
#pragma unroll
for (uint32_t m = 0; m < Gemm1Acc::rows; m++)
{
#pragma unroll
for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++)
for (uint32_t m = 0; m < Gemm1Acc::rows; m++)
{
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * k};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
gmma::instM * m, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
auto const descVT = addAddr(
descVTBase, &vtBuf(0, kOffsetInGrains.template mod<SharedMem::VTBuffer::cols>().get()));
gmma::mma_async_shmA<MathElem, headElems>(
reinterpret_cast<float(&)[exactDiv(headElems, gmma::instNBase)][2][2]>(acc(m, 0)), descX,
descVT, true);
#pragma unroll
for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++)
{
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * k};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
gmma::instM * m, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
auto const descVT = addAddr(
descVTBase, &vtBuf(0, kOffsetInGrains.template mod<SharedMem::VTBuffer::cols>().get()));
gmma::mma_async_shmA<MathElem, headElems>(
reinterpret_cast<float(&)[exactDiv(headElems, gmma::instNBase)][2][2]>(acc(m, 0)), descX,
descVT, true);
}
}
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of gmma.
gmma::wait_group<0>();
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
#endif
}

if (idxIter == nbIters - 1)
{
// gmma::wait_group should have already synchronized threads, so this may be unnecessary.
@@ -1471,8 +1578,24 @@ CUBIN_EXPORT __global__
tensorMap
#endif
};
#if SKIP_SOFTMAX_ATTN
for (auto& b : smem.skipSoftmaxXBar)
{
unused(b.consumed.arrive());
}
#endif
for (uint32_t idxIter = 0; idxIter < nbIters; idxIter++)
{
uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf;
auto& vBar = smem.vBar[idxVBuf];
#if SKIP_SOFTMAX_ATTN
uint32_t idxXBuf = idxIter % SharedMem::nbXBuf;
auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf];
skipSoftmaxXBar.produced.arrive_and_wait();
bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToV[idxXBuf];
skipSoftmaxXBar.consumed.arrive();
#endif

uint32_t const idxVTile = idxVTileInit + idxIter * nbSubSeq;
vTileLoader.loadPages(idxVTile);
#if USE_INPUT_KV || ENABLE_PDL == 2
@@ -1506,8 +1629,20 @@ CUBIN_EXPORT __global__
}
#endif

uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf;
auto& vBar = smem.vBar[idxVBuf];
#if SKIP_SOFTMAX_ATTN
if (shouldSkipSoftmaxAttn)
{
vBar.consumed.arrive_and_wait();
// Compared to non-skip softmax attn, we need to increase the vBar.produced count to avoid a race
// condition where vBar.consumed is arrived again without a wait. Without skip softmax attn, the
// XVGemm WG waits for tx_count, so its progress cannot go ahead of the vload warp. With skip
// softmax attn, the XVGemm WG may go ahead of the vload warp, as the previous vBar only has the
// XVGemm WG threads and a tx_count (now = 0); it may then arrive vBar.consumed before it is
// arrive_and_wait-ed.
vBar.produced.arrive();
continue;
}
#endif

vBar.consumed.arrive_and_wait();
if (warpElectSync())
{
@@ -1517,6 +1652,9 @@ CUBIN_EXPORT __global__
vTileLoader.loadData(smem.vBuf(idxVBuf)[idxPart], idxVTile, idxPart, vBar.produced);
}
}
#if SKIP_SOFTMAX_ATTN
vBar.produced.arrive();
#endif
__syncwarp();
}
}
@@ -1992,9 +2130,23 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
#endif // SPEC_DEC

// smemColMax is persistent across multiple iterations
#if SKIP_SOFTMAX_ATTN
__device__ inline RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax,
Gemm0Acc const& src, float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip)
#else
__device__ inline RegColWiseVec computeWarpGrpColMax_sync(
CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src)
#endif
{
#if SKIP_SOFTMAX_ATTN
if (threadIdx.x == 0)
{
*smemSkipVote = maybeSkip ? 1U : 0U; // will sync before vote
}
float const lnThreshold
= log(skipSoftmaxThreshold); // this can be -inf, but should be safe as we only use it for comparison
#endif

auto colMax = RegColWiseVec::filled(Vec<float, 2>::filled(safeInitRowMax));
#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
@@ -2029,6 +2181,9 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
}

uint32_t const lane = laneId();
#if SKIP_SOFTMAX_ATTN
auto prevOrCurrentMax = RegColWiseVec();
#if SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
if (lane < 4)
{
#pragma unroll
@@ -2037,12 +2192,43 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
#pragma unroll
for (uint32_t j = 0; j < 2; j++)
{
atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
prevOrCurrentMax[n][j] = smemColMax[8 * n + 2 * lane + j];
}
}
}
warpGrpBar.arrive_and_wait();
#endif
#endif

if (lane < 4)
{
#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
{
#pragma unroll
for (uint32_t j = 0; j < 2; j++)
{
#if SKIP_SOFTMAX_ATTN && !SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
// prevOrCurrentMax <= the actual smemColMax (after updates from all 4 warps are done), but always >=
// smemColMax(Prev), the smemColMax value *before* this tile is computed.
// When determining whether to skip, it is safe to use prevOrCurrentMax: 1) if all 4 warps' local max <
// smemColMax(Prev), then prevOrCurrentMax == smemColMax(Prev) and the result is not affected; 2) if some
// local max > smemColMax(Prev), then prevOrCurrentMax > smemColMax(Prev) and some warps may incorrectly
// vote to skip, but at least one warp whose local col max is larger will not skip, so the tile is not
// skipped. This saves some sync and checks, but has an issue when threshold > 1.
prevOrCurrentMax[n][j] = atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
#else
atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
#endif
}
}
}
warpGrpBar.arrive_and_wait();

uint32_t const idxInQuad = lane % 4;
#if SKIP_SOFTMAX_ATTN
bool localShouldSkip = true;
#endif

#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
@@ -2050,10 +2236,21 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
#pragma unroll
for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++)
{
#if SKIP_SOFTMAX_ATTN
if (lane < 4 && 8 * n + 2 * idxInQuad + j < headGrpSize)
{
localShouldSkip &= (colMax[n][j] - prevOrCurrentMax[n][j]) < lnThreshold;
}
#endif
assert(colMax[n][j] <= smemColMax[8 * n + 2 * idxInQuad + j]);
colMax[n][j] = smemColMax[8 * n + 2 * idxInQuad + j];
}
}

#if SKIP_SOFTMAX_ATTN
atomicAnd(smemSkipVote, static_cast<uint32_t>(localShouldSkip)); // this will be translated to redux and voteu
#endif

warpGrpBar.arrive_and_wait();
return colMax;
}
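The vote pattern above (reset a shared flag to 1, atomicAnd each participant's local predicate into it, then barrier and read) generalizes to any all-of reduction across a warp group. A minimal self-contained sketch of the same idea, not the kernel's actual code (names are illustrative; CtaBarrier is the barrier type already used in this file):

__device__ bool warpGrpAllOf(bool localPred, uint32_t* smemFlag, CtaBarrier& bar)
{
    if (threadIdx.x == 0)
    {
        *smemFlag = 1U; // optimistic: assume everyone votes true
    }
    bar.arrive_and_wait(); // make the reset visible before voting
    atomicAnd(smemFlag, static_cast<uint32_t>(localPred)); // any false vote clears the flag
    bar.arrive_and_wait(); // all votes are in
    return *smemFlag != 0;
}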
@@ -2199,7 +2396,7 @@ __device__ inline void storeGemm0AccToShm(
uint32_t const idxOctInsideHalf = idxInHalf / 8;
uint32_t const idxRowInsideOct = lane % 8;
uint32_t const warpBaseC = 16 * warpRank;
auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> std::pair<uint32_t, uint32_t>
auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> mha::pair<uint32_t, uint32_t>
{
uint32_t const accR = idxAccCoreMat / Gemm0Acc::cols;
uint32_t const accC = idxAccCoreMat % Gemm0Acc::cols;
@@ -3231,6 +3428,24 @@ __device__ inline void storeRotatedPairsForQ(SharedMem::QBuffer& dst,
}

#ifndef GENERATE_CUBIN
uint32_t computeNbSubSeqPerSeqHopperF8MHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen)
{
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
float const factor = 0.25f;
return mha::min<uint32_t>(
mha::max<uint32_t>(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)),
divUp(maxSeqLen, gemm0CtaTileNbTokens));
}

void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@@ -3268,6 +3483,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream)
{
@@ -3286,22 +3507,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
uint32_t const nbVHeads = nbKHeads;
uint32_t const nbQHeads = nbKHeads * headGrpSize;
uint32_t const nbQKVHeads = nbQHeads + nbKHeads + nbVHeads;
uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t
{
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
float const factor = 0.25f;
return mha::min<uint32_t>(
mha::max<uint32_t>(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)),
divUp(maxSeqLen, gemm0CtaTileNbTokens));
}();
uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqHopperF8MHA(prop, batchSize, nbKHeads, maxSeqLen);
#if SPEC_DEC
uint32_t const qSeqLen = specDecParams.qSeqLen;
#else
@@ -3371,6 +3577,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#endif
#if SPEC_DEC
specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
skippedBlockCount, totalBlockCount,
#endif
#endif
semaphores, scratch);
#else
@@ -1272,6 +1272,19 @@ using is_void = is_same<remove_cv_t<T>, void>;
template <typename T>
inline constexpr bool is_void_v = is_void<T>::value;
#endif

#ifndef GENERATE_CUBIN
template <typename T1, typename T2>
using pair = std::pair<T1, T2>;
#else
template <typename T1, typename T2>
struct pair
{
T1 first;
T2 second;
};
#endif

} // namespace mha

#if GENERATE_CUBIN
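A likely rationale for the shim (inferred from the surrounding GENERATE_CUBIN guards, not stated in the commit): cubin-generation builds compile this header without the host standard library, so std::pair is unavailable there, and the aggregate above mirrors just the two members the MHA code needs; that is also why toAccCoords above now returns mha::pair instead of std::pair.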
@@ -50,7 +50,8 @@ using Vector = Matrix<Type, Size, 1>;
template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks)
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)
{
uint32_t const nbTiles = divUp(seqLen, tileSize);
auto gemm1Acc = Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor>::Zero().eval();
@@ -61,6 +62,16 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
float const qkScale = qScale * kvScale / sqrtf(validElemsPerHead);
uint32_t const seqBeg = (seqLen < slidingWinSize ? 0 : seqLen - slidingWinSize);
uint32_t const idxTileBeg = seqBeg / tileSize;

uint32_t const nbSubSeq = (multiBlockNum > 0 && nbTiles >= 2) ? mha::min(nbTiles, multiBlockNum) : 1;
std::vector<Eigen::Vector<float, headGrpSize>> skipRowMaxs(nbSubSeq);
for (uint32_t i = 0; i < nbSubSeq; i++)
{
skipRowMaxs[i].fill(-INFINITY);
}
bool const disableSkipForShortSeq = (seqLen < skipSoftmaxThresholdScaleFactor);
float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / seqLen;

for (uint32_t idxTile = idxTileBeg; idxTile < nbTiles; idxTile++)
{
Eigen::Matrix<float, headGrpSize, tileSize, Eigen::RowMajor> gemm0Acc;
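Note how the host reference mirrors the kernel's multi-block partitioning: it keeps one running max per sub-sequence (skipRowMaxs[idxTile % nbSubSeq], matching the kernel's round-robin tile assignment), and the (idxTile - idxTileBeg) >= nbSubSeq guard below refuses to skip the first tile of each sub-sequence, just as the kernel's maybeSkip excludes idxIter == 0 (an observation about the code, not commit text).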
@@ -88,7 +99,22 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
}
}

Eigen::Vector<float, headGrpSize> const tileRowMax = gemm0Acc.rowwise().maxCoeff().cwiseMax(rowMax).eval();
Eigen::Vector<float, headGrpSize> const localRowMax = gemm0Acc.rowwise().maxCoeff().eval();
Eigen::Vector<float, headGrpSize> const tileRowMax = localRowMax.cwiseMax(rowMax).eval();
auto const prevSkipRowMax = skipRowMaxs[idxTile % nbSubSeq];
skipRowMaxs[idxTile % nbSubSeq] = localRowMax.cwiseMax(skipRowMaxs[idxTile % nbSubSeq]).eval();

if (!disableSkipForShortSeq && skipSoftmaxThreshold > 0)
{
*totalBlockCount += 1;
auto const skipSoftmaxMask = ((localRowMax - prevSkipRowMax).array() < std::log(skipSoftmaxThreshold));
bool const skipBlock = skipSoftmaxMask.all() && ((idxTile - idxTileBeg) >= nbSubSeq);
if (skipBlock)
{
*skippedBlockCount += 1;
continue;
}
}

Eigen::Matrix<float, headGrpSize, tileSize, Eigen::RowMajor> tileX
= (gemm0Acc.colwise() - tileRowMax).array().exp().eval();
@@ -138,7 +164,8 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
template Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> \
refFlashAttention<prec, tileSize, isPaged, useBeamSearch>(IOHead const* q, \
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, \
float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks)
float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, \
float skipSoftmaxThreshold, uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)

INSTANTIATE_refFlashAttention(CacheElem, 64, false, false);
INSTANTIATE_refFlashAttention(CacheElem, 64, false, true);
@@ -88,7 +88,8 @@ struct CacheSeq<true, true>
template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks);
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum);

template <typename MathElem, bool isPaged, bool useBeamSearch>
#if SPEC_DEC
@@ -150,7 +150,8 @@ template <uint32_t nbKHeads>
#endif
#endif
void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false,
bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30)
bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30,
float skipSoftmaxThresholdScaleFactor = 0.0f)
{
#if IS_MLA
if (nbKHeads != 1)
@@ -224,6 +225,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
seqLen = (16U << 20) / gmemCacheHeadBytes; // 32MB per K+V head.
}
ctxLen = std::min(ctxLen, seqLen);
uint32_t skippedBlockCount = 0;
uint32_t totalBlockCount = 0;
if (skipSoftmaxThresholdScaleFactor > 0)
{
assert(useQGMMA);
}
float const kScale = cacheElemSize == 2 ? 1.f : 1 / 4.f;
float const vScale = kScale;
float const qScale = 1.f;
@@ -329,6 +336,17 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
auto const rcpOutScale = ManagedMemBuf<float>(1);
auto const seqLenList = ManagedMemBuf<uint32_t[beamWidth]>(batchSize);
auto const ctxLenList = ManagedMemBuf<uint32_t[beamWidth]>(batchSize);
#if SKIP_SOFTMAX_ATTN
#ifdef SKIP_SOFTMAX_ATTN_BLOCK_STATS
auto const kernelSkippedBlockCount = ManagedMemBuf<uint32_t>(1);
auto const kernelTotalBlockCount = ManagedMemBuf<uint32_t>(1);
kernelSkippedBlockCount[0] = 0;
kernelTotalBlockCount[0] = 0;
#endif
#else
EXPECT_EQ(skipSoftmaxThresholdScaleFactor, 0.0f)
<< "Got non-zero skipSoftmaxThresholdScaleFactor while SKIP_SOFTMAX_ATTN is not enabled.";
#endif
#if USE_PAGED_KV_CACHE
auto const pageListBuf = ManagedMemBuf<std::byte>(pageListBytes);
#if PAGED_KV_CACHE_LAYOUT == 1
@@ -726,6 +744,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
maxSeqLen, &seqLenList[0][0], batchSize, kvCacheScale.get(), semaphores.get(), scratch, stream);
};
#else
auto multiBlockNum = [&]()
{
auto const calcFunc = useQGMMA ? &computeNbSubSeqPerSeqHopperF8MHA : &computeNbSubSeqPerSeqMHA;
return calcFunc(prop, batchSize, nbKHeads, maxSeqLen);
}();
auto runKernel = [&]()
{
auto const launchFunc = useQGMMA ? &launchHopperF8MHA : &launchMHA;
@@ -776,6 +799,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
batchSize, kvCacheScale.get(),
#if SPEC_DEC
specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
kernelSkippedBlockCount.get(), kernelTotalBlockCount.get(),
#endif
#endif
semaphores.get(), scratch, stream);
checkCuda(cudaGetLastError());
@@ -813,6 +842,10 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
checkCuda(cudaEventRecord(toc, stream));
prefetchToDevice(cudaCpuDeviceId);
checkCuda(cudaStreamSynchronize(stream));
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
kernelSkippedBlockCount[0] /= nbIters;
kernelTotalBlockCount[0] /= nbIters;
#endif
if (testPerf)
{
float ms;
@@ -849,6 +882,15 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
= totalNbCacheLoadBytes + inputBytes + outputBytes; // we ignore page indices and beam search indices.
float const dramSolTime = totalTraffic / bandwidth * 1E3f;
float const dramSolRatio = dramSolTime / ms;
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
size_t const totalNbCacheLoadWithSkip = gmemCacheHeadBytes
* (nbKHeads + nbVHeads * (1 - 1.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]))
* nbLoadedCacheTokens;
float const totalTrafficWithSkip
= totalNbCacheLoadWithSkip + inputBytes + outputBytes; // we ignore page indices and beam search indices.
float const dramSolTimeWithSkip = totalTrafficWithSkip / bandwidth * 1E3f;
float const dramSolRatioWithSkip = dramSolTimeWithSkip / ms;
#endif
if (verbose)
{
printf("done\n");
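A worked example of the adjusted traffic model (illustrative numbers, not from the commit): only V-tile loads can be skipped, so with kernelSkippedBlockCount / kernelTotalBlockCount = 0.4 and nbVHeads == nbKHeads, the cache term becomes gmemCacheHeadBytes * (nbKHeads + 0.6 * nbVHeads) * nbLoadedCacheTokens, i.e. a 20% reduction of total K+V traffic (K loads are unchanged), which raises the achievable dramSolRatioWithSkip accordingly.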
@@ -863,7 +905,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
}
float const tops = headGrpSize * qSeqLen * float(seqLen) * (validElemsPerKHead + validElemsPerVHead) * 2
* nbKHeads * batchSize / (ms * 1E-3F) * 1E-12F;
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
printf("dramSolRatioWithSkip: %f%% (%f ms, TOPS = %f)\n", dramSolRatioWithSkip * 100, ms, tops);
#else
printf("dramSolRatio: %f%% (%f ms, TOPS = %f)\n", dramSolRatio * 100, ms, tops);
#endif
}
if (refCheck)
{
@@ -1084,8 +1132,8 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
if (useQGMMA)
{
refOutput = refFlashAttention<CacheElem, 64>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize,
refAttentionSinks);
vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks,
skipSoftmaxThresholdScaleFactor, &skippedBlockCount, &totalBlockCount, multiBlockNum);
// refOutput = refAttention<CacheElem>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
// vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize);
}
@ -1132,6 +1180,14 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#if SKIP_SOFTMAX_ATTN
|
||||
printf("host skippedBlockCount: %d/%d (%.2f%%)\n", skippedBlockCount, totalBlockCount,
|
||||
totalBlockCount == 0 ? 0.0f : 100.0f * skippedBlockCount / totalBlockCount);
|
||||
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
|
||||
printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
|
||||
kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
|
||||
#endif
|
||||
#endif
|
||||
if (saveData)
|
||||
{
|
||||
fout_refOutput.close();
|
||||
@ -1253,6 +1309,14 @@ TEST(RefCheck, llama_V2_70b)
|
||||
#if SLIDING_WINDOW
|
||||
runTest<2>(2, 4096, false, true, false, false, false, ~0, 256);
|
||||
runTest<2>(2, 400, false, true, false, false, false, ~0U, 256);
|
||||
#endif
|
||||
#if SKIP_SOFTMAX_ATTN
|
||||
runTest<1>(32, 2048, false, true, false, false, false, ~0U, 1U << 30, 0.f);
|
||||
runTest<4>(32, 1538, false, true, false, false, false, ~0U, 1U << 30, 1280.f);
|
||||
runTest<2>(32, 4096, false, true, false, false, false, ~0U, 1U << 30, 125.f);
|
||||
runTest<4>(32, 300, false, true, false, false, false, ~0U, 1U << 30, 80.f);
|
||||
runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 501.0f);
|
||||
runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 500.f);
|
||||
#endif
|
||||
runTest<8>(120, 367, false, true);
|
||||
runTest<8>(1792, 2048, false, true);
|
||||
|
||||
@ -1556,7 +1556,7 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm
}
}

std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, bool pinBlocks)
{
SizeType32 numBlocksStoredForReuse = 0;
@ -1569,7 +1569,7 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s

auto numBlocks = blockKeys.size();
std::vector<BlockPtr> storedBlocks;
std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
std::vector<KVCacheBlock::IdType> pinnedBlockIds;
for (std::size_t blockCnt = 0; blockCnt < numBlocks; ++blockCnt)
{
auto const bid = blockIds[blockCnt];
@ -1620,14 +1620,14 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s
if (pinBlocks)
{
searchRoot->incRefCount();
pinnedBlockIds.push_back(searchRoot->getBlockId());
}
lastStoredId = searchRoot->getBlockId();
}
if (mEventManager)
{
mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);
}
return {numBlocksStoredForReuse, lastStoredId};
return {numBlocksStoredForReuse, pinnedBlockIds};
}

void BlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx)
@ -1715,15 +1715,15 @@ std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::c
return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};
}

std::optional<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
std::vector<KVCacheBlock::IdType> pinnedBlockIds;
for (auto& [_, manager] : mWindowBlockManagers)
{
lastStoredId = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
pinnedBlockIds = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
}
return lastStoredId;
return pinnedBlockIds;
}

std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
@ -1767,7 +1767,7 @@ void BlockManager::pinBlocks(GenerationRequest& sequence)
}
}

void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void BlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
// Use the first window size
if (mWindowBlockManagers.empty())
@ -1775,7 +1775,7 @@ void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
return;
}
auto& firstManager = mWindowBlockManagers.begin()->second;
firstManager.unpinBlocksById(blockId);
firstManager.unpinBlocksById(blockIds);
}

void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
@ -1788,21 +1788,26 @@ void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
}
}

void WindowBlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
if (blockId < 0 || static_cast<size_t>(blockId) >= mAllBlocksById.size())
if (blockIds.empty())
{
return;
}
auto block = mAllBlocksById[blockId];
while (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)

for (auto const& blockId : blockIds)
{
block->decRefCount();
if (!block->hasRefs())
TLLM_CHECK_WITH_INFO(blockId >= 0 && static_cast<size_t>(blockId) < mAllBlocksById.size(),
"Block id %d is out of range", blockId);
auto block = mAllBlocksById[blockId];
if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
{
mEvictionPolicy->releaseBlock(block);
block->decRefCount();
if (!block->hasRefs())
{
mEvictionPolicy->releaseBlock(block);
}
}
block = std::move(block->getPrevBlock());
}
}

@ -1870,7 +1875,7 @@ void WindowBlockManager::storeNewBlock(GenerationRequest& sequence, OptionalRef<
(void) storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]);
}

std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
auto constexpr beamIdx = 0;
@ -1883,7 +1888,10 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
auto const usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;
auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, true);
auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
return storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks).second;

auto [numStored, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks);

return pinnedBlockIds;
}

std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
@ -1922,7 +1930,7 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
std::transform(allocatedBlocks.begin(), allocatedBlocks.end(), cacheBlockIds.begin(),
[](BlockPtr const& block) { return block->getBlockId(); });

auto [numBlocksStoredForReuse, lastStoredId] = storeBlocks(std::move(blockKeys), cacheBlockIds);
auto [numBlocksStoredForReuse, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds);
TLLM_LOG_DEBUG("%s::releaseBlocks Request %lu, %d blocks stored for reuse", mLogPrefix.c_str(),
sequence.getRequestId(), numBlocksStoredForReuse);
}
@ -2499,15 +2507,14 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
return lastStoredId;
}

std::optional<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
TLLM_LOG_TRACE("[%s]::%s start", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
auto& sequence = getSequence(requestId);
std::optional<KVCacheBlock::IdType> lastStoredId
= mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
auto pinnedBlockIds = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
TLLM_LOG_TRACE("[%s]::%s stop", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
return lastStoredId;
return pinnedBlockIds;
}

void KVCacheManager::schedulingRemoveSequence(RequestIdType requestId)
@ -2522,9 +2529,9 @@ void KVCacheManager::pinBlocks(RequestIdType requestId)
mBlockManager.pinBlocks(sequence);
}

void KVCacheManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void KVCacheManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
mBlockManager.unpinBlocksById(blockId);
mBlockManager.unpinBlocksById(blockIds);
}

SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSlotOffset, RequestIdType requestId) const
@ -298,6 +298,11 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
xqaParams.use_sparse_attention = useTllmGenSparseAttention();
// Skip softmax threshold.
xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode;
#ifdef SKIP_SOFTMAX_STAT
// Statistics of skip-softmax, pointers of device memory for output
xqaParams.skip_softmax_total_blocks = mSkipSoftmaxTotalBlocks;
xqaParams.skip_softmax_skipped_blocks = mSkipSoftmaxSkippedBlocks;
#endif
// Cross attention parameters.
xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
@ -2179,11 +2179,11 @@ void Executor::Impl::terminateContextFinishedRequests(InTransList& inTransmissio
auto req = item.request;
if (req->isDisaggContextCompleteState())
{
// If lastBlockId was tracked, unpin it. Otherwise, just terminate.
// If pinnedBlockIds were tracked, unpin them. Otherwise, just terminate.
auto kvMgr = mModel->getKVCacheManager();
if (kvMgr && item.lastBlockId.has_value())
if (kvMgr && !item.pinnedBlockIds.empty())
{
kvMgr->unpinBlocksById(item.lastBlockId.value());
kvMgr->unpinBlocksById(item.pinnedBlockIds);
}
else
{
@ -2234,14 +2234,14 @@ Executor::Impl::RequestList Executor::Impl::populateNewResponses(
// move the in transmission requests to another tracker
if (llmReq->isDisaggContextTransmissionState())
{
std::optional<SizeType32> lastBlockId{};
std::vector<SizeType32> pinnedBlockIds{};
auto kvMgr = mModel->getKVCacheManager();
if (kvMgr && kvMgr->isEnableBlockReuse() && !kvMgr->getBlockManager().isVariableWindow())
{
lastBlockId = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
pinnedBlockIds = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
mModel->terminateRequest(llmReq);
}
inTransmissionRequests.push_back(InTransmissionItem{*it, lastBlockId});
inTransmissionRequests.push_back(InTransmissionItem{*it, pinnedBlockIds});
}
finishedRequests.push_back(*it);
it = activeRequests.erase(it);

@ -80,12 +80,12 @@ class Executor::Impl
using RequestList = std::list<LlmRequestPtr>;

// When block reuse is enabled for context worker for disaggregated serving,
// we need to store the last block id so that we can unpin the block when
// we need to store the pinned block ids so that we can unpin them when
// the request is finished.
struct InTransmissionItem
{
LlmRequestPtr request;
std::optional<SizeType32> lastBlockId;
std::vector<SizeType32> pinnedBlockIds;
};

using InTransList = std::list<InTransmissionItem>;
@ -105,7 +105,8 @@ CubinObj CompileEngine::compile() const
// scratch in this case.
/*use_input_kv=*/applyRoPEInXqaKernel,
/*rope_style=*/ropeStyle,
/*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree};
/*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree,
/*use_skip_softmax_attn=*/mXqaParams.skip_softmax_threshold_scale_factor != 0};
if (context.kernel_type == TLLM_XQA_JIT_MLA)
{
auto const& c = context;

@ -232,6 +232,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
jit::CubinObj const* const cubinObj = mResource->getCubinObjRegistry()->getCubin(key);
TLLM_CHECK(cubinObj != nullptr && cubinObj->isInitialized());
bool const isSpecDec = xqaParams.multi_query_tokens;
bool const isSkipSoftmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
bool const isHMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kAMPERE_WARP_SPECIALIZED);
bool const isGMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kHOPPER_WARP_SPECIALIZED);
bool const isMLAKernel = (cubinObj->getKernelType() == XQAKernelType::kSM120_MLA);
@ -378,7 +379,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
.mask = reinterpret_cast<SpecDecParams::MaskType const*>(xqaParams.spec_decoding_packed_mask)};
};

constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 16;
constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 19;
uint32_t idxNextParam = 0;
void* kernelParams[kMAX_NB_KERNEL_PARAMS];
auto appendParam = [&](auto* p) mutable
@ -514,6 +515,16 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
appendParam(&specDecParams);
specDecBlocks = divUp(specDecParams.qSeqLen, 64 / num_q_heads_over_kv);
}
if (isSkipSoftmax)
{
TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel for now.");
TLLM_CHECK_WITH_INFO(!isSpecDec, "skip softmax is not supported with spec dec for now.");
appendParam(&xqaParams.skip_softmax_threshold_scale_factor);
#ifdef SKIP_SOFTMAX_STAT
appendParam(&xqaParams.skip_softmax_total_blocks);
appendParam(&xqaParams.skip_softmax_skipped_blocks);
#endif
}
appendParam(&launchParams.semaphores);
appendParam(&launchParams.scratch);
kernelParams[idxNextParam] = nullptr; // one extra nullptr at end as guard.
@ -96,10 +96,16 @@ bool supportConfigQGMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlu
{
return false;
}
if (xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, xqaParams.kv_cache_data_type))
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (!is_skip_softmax && xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
{
// Only use hopper kernel with fp16/bf16 kv cache data type when skip softmax is enabled
return false;
}
if (xqaParams.beam_width != 1)
{
return false;
@ -168,6 +174,11 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (is_skip_softmax)
{
return false;
}
return true;
}

@ -201,6 +212,11 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (is_skip_softmax)
{
return false;
}
return true;
}
@ -66,6 +66,7 @@ extern "C"

bool is_spec_dec_tree
= true; // useful only when multi_query_tokens, should be true unless using linear tree in spec-dec.
bool use_skip_softmax_attn;
} tllmXqaJitContext;

// tllmXqaJitProgram is an opaque handle for a program.

@ -215,6 +215,10 @@ tllmXqaJitStatus getMacroFlags(tllmXqaJitContext const* context, std::vector<std
macros["USE_INPUT_KV"] = context->use_input_kv ? "1" : "0";
macros["ROPE_STYLE"] = std::to_string(int(context->rope_style));
macros["IS_SPEC_DEC_TREE"] = context->is_spec_dec_tree ? "1" : "0";
macros["SKIP_SOFTMAX_ATTN"] = context->use_skip_softmax_attn ? "1" : "0";
#ifdef SKIP_SOFTMAX_STAT
macros["SKIP_SOFTMAX_ATTN_BLOCK_STATS"] = context->use_skip_softmax_attn ? "1" : "0";
#endif

// Without these macros, NVRTC uses precompiled headers for cuda_fp16.h etc.
// Linking might fail due to ABI incompatibility.

@ -493,6 +493,10 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
{
SUPPORT_RETURN_FALSE("streaming-llm");
}
if (xqaParams.skip_softmax_threshold_scale_factor != 0)
{
SUPPORT_RETURN_FALSE("skip_softmax_threshold_scale_factor");
}

// OPTIMIZE: For the standard generation-phase MHA, there are still extra limitations.
// NOTE: Medusa mode = Multi_query_tokens > 1.
@ -64,6 +64,21 @@ CUtensorMapSwizzle getSwizzleMode(uint32_t partBytes)
}
};

CUtensorMapDataType_enum getDataTypeFromXqaParams(XQAParams const& xqaParams)
{
if (xqaParams.kv_cache_data_type == DATA_TYPE_BF16)
{
return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
}
else if (xqaParams.kv_cache_data_type == DATA_TYPE_FP16)
{
return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
}
TLLM_CHECK(xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 || xqaParams.kv_cache_data_type == DATA_TYPE_E5M2
|| xqaParams.kv_cache_data_type == DATA_TYPE_INT8);
return CU_TENSOR_MAP_DATA_TYPE_UINT8;
}

CUtensorMap makeTensorMapForQ(std::shared_ptr<CUDADriverWrapper> const& driver, void const* addr,
CUtensorMapDataType_enum dataType, uint32_t headElems, uint32_t totalNbHeads, uint32_t partElems, uint32_t boxHeads)
{
@ -131,24 +146,26 @@ CUtensorMap makeTensorMapForHopperXqaKVCache(
if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
{
uint32_t const headElems = xqaParams.head_size;
uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const elemBytes = getElemBytes(dataType);
TLLM_CHECK(headElems <= 256);
uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
}
else
{
static_assert(std::is_same_v<KVCacheBuffer, KVLinearBuffer>);
uint32_t const headElems = xqaParams.head_size;
uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const elemBytes = getElemBytes(dataType);
TLLM_CHECK(headElems <= 256);
uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width,
xqaParams.batch_size, partElems);
return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width, xqaParams.batch_size,
partElems);
}
}

@ -161,11 +178,12 @@ template <typename KVCacheBuffer>
CUtensorMap makeTensorMapForXqaMlaKVCache(std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver,
XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, bool forK)
{
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const partElems = (forK ? 64 : 128);
if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
{
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
}
else
{
@ -183,7 +201,7 @@ CUtensorMap makeTensorMapForXqaMlaQ(
std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver, XQAParams const& xqaParams, void const* q)
{
uint32_t const partElems = 64;
return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size,
return makeTensorMapForQ(driver, q, getDataTypeFromXqaParams(xqaParams), xqaParams.head_size,
xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads);
}
} // namespace kernels
@ -119,7 +119,12 @@ struct XQAParams
bool use_sparse_attention = false;

// Skip softmax threshold.
float skip_softmax_threshold_scale_factor = 0.0f;
float skip_softmax_threshold_scale_factor = 0;

#ifdef SKIP_SOFTMAX_STAT
uint32_t* skip_softmax_total_blocks = nullptr;
uint32_t* skip_softmax_skipped_blocks = nullptr;
#endif

cudaStream_t stream = 0;
// layer index
@ -199,6 +204,10 @@ struct XQAParams
<< "sparse_params: " << sparse_params.toString() << std::endl
<< "use_sparse_attention :" << (use_sparse_attention ? "true" : "false") << std ::endl
<< "skip_softmax_threshold_scale_factor :" << skip_softmax_threshold_scale_factor << std ::endl
#ifdef SKIP_SOFTMAX_STAT
<< "skip_softmax_total_blocks :" << skip_softmax_total_blocks << std ::endl
<< "skip_softmax_skipped_blocks :" << skip_softmax_skipped_blocks << std ::endl
#endif
<< "stream :" << stream;

return ss.str();
@ -161,6 +161,7 @@ void initBindings(nb::module_& m)
.def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
.def_prop_ro("is_finished", &GenLlmReq::isFinished)
.def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
.def_prop_ro("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
.def_prop_rw(
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
.def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

@ -123,7 +123,7 @@ public:
NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest, pinOnRelease);
}

std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
{
NB_OVERRIDE_PURE(storeBlocksForReuse, requestId, llmRequest, pinBlocks);

@ -165,6 +165,7 @@ void initBindings(pybind11::module_& m)
.def("set_finished_reason", &GenLlmReq::setFinishedReason, py::arg("finish_reason"), py::arg("beam"))
.def_property_readonly("is_finished", &GenLlmReq::isFinished)
.def_property_readonly("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
.def_property_readonly("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
.def_property(
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
.def_property_readonly("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

@ -111,10 +111,10 @@ public:
requestId, llmRequest, pinOnRelease);
}

std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
{
PYBIND11_OVERLOAD_PURE(std::optional<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
PYBIND11_OVERLOAD_PURE(std::vector<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
requestId, llmRequest, pinBlocks);
}
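Taken together, these changes define the new pin/unpin contract end to end: `storeBlocksForReuse(..., pinBlocks=true)` now returns the full list of pinned block ids rather than only the last one, and `unpinBlocksById` accepts that list and releases each block directly, with no prev-pointer walk. Below is a minimal sketch of the lifecycle as seen through the Python bindings; the snake_case method names are an assumption for illustration and are not confirmed by this diff:

```python
# Hypothetical usage sketch; kv_cache_manager is a bound BaseKVCacheManager
# instance, and request_id / llm_request identify a finished
# disaggregated-context request.

# Store reusable blocks and pin them so eviction cannot reclaim them while
# the KV cache is still being transferred to the generation server.
pinned_block_ids = kv_cache_manager.store_blocks_for_reuse(
    request_id, llm_request, pin_blocks=True)

# ... KV cache transfer to the generation worker happens here ...

# Unpin every stored block in one call; each id is released independently,
# which is why the vector-of-ids interface replaces the old walk from a
# single last-block id.
kv_cache_manager.unpin_blocks_by_id(pinned_block_ids)
```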
@ -4066,11 +4066,13 @@ TEST_F(KVCacheManagerTest, PinAndUnpinBlocksById)
kvCacheManager.pinBlocks(requestId);
auto lastBlockIdOpt = kvCacheManager.getLastBlockId(requestId);
ASSERT_TRUE(lastBlockIdOpt.has_value());
auto const& allBlockIds = kvCacheManager.getCacheBlockIds(requestId, maxAttentionWindow)[0];
std::vector<SizeType32> pinnedBlockIds(allBlockIds.begin(), allBlockIds.end());
(void) kvCacheManager.removeSequence(requestId, llmRequest);
auto const freeAfterRemovePinned = kvCacheManager.getNumFreeBlocks();
EXPECT_LT(freeAfterRemovePinned, totalBlocks);

kvCacheManager.unpinBlocksById(lastBlockIdOpt.value());
kvCacheManager.unpinBlocksById(pinnedBlockIds);
auto const freeAfterUnpin = kvCacheManager.getNumFreeBlocks();
EXPECT_EQ(freeAfterUnpin, totalBlocks);
}
@ -227,3 +227,7 @@ Run `bench.sh` to begin a serving benchmark. This will take a long time if you r
```shell
./bench.sh
```

## Known Issues

Qwen3-Next-80B-A3B exhibits relatively low accuracy on the SciCode-AA-v2 benchmark.
@ -38,13 +38,14 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
| `DeepseekV3ForCausalLM` | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes |
| `DeepseekV32ForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3MoeForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3NextForCausalLM` | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| `Qwen3NextForCausalLM` [^3] | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| `Llama4ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes |
| `GptOssForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes [^3] | Yes | Yes | Yes | N/A | Yes | Yes |
| `GptOssForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes [^4] | Yes | Yes | Yes | N/A | Yes | Yes |

[^1]: Chunked Prefill for MLA can only be enabled on SM100/SM103.
[^2]: KV cache reuse for MLA can only be enabled on SM90/SM100/SM103 and in BF16/FP8 KV cache dtype.
[^3]: Overlap scheduler isn't supported when using EAGLE-3(Two Model Engine) for GPT-OSS.
[^3]: Qwen3-Next-80B-A3B exhibits relatively low accuracy on the SciCode-AA-v2 benchmark.
[^4]: Overlap scheduler isn't supported when using EAGLE-3(Two Model Engine) for GPT-OSS.


# Multimodal Feature Support Matrix (PyTorch Backend)
@ -0,0 +1 @@
attn_backend: triton
@ -65,7 +65,7 @@ models:
- name: bigcode/starcoder2-7b
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: bigcode/starcoder2-15b-instruct-v0.1
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'attn_backend_triton.yaml']
- name: deepseek-ai/DeepSeek-Prover-V1.5-SFT
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: deepseek-ai/DeepSeek-Prover-V2-7B
@ -118,8 +118,6 @@ models:
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-27b-it
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-2b-it
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: deepseek-ai/DeepSeek-V2.5
  yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
# DISABLED: Network timeout downloading from Hugging Face
@ -145,8 +143,6 @@ models:
# DISABLED: Graph transformation error in auto-deploy
# - name: neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8
#   yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: TheBloke/falcon-40b-instruct-GPTQ
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: google/gemma-2-27b-it
@ -159,7 +155,7 @@ models:
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B-Preview
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: Qwen/Qwen3-Coder-32B-Instruct
- name: Qwen/Qwen3-Coder-30B-A3B-Instruct
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/Qwen3-235B-A22B-Instruct-2507
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
@ -222,3 +218,5 @@ models:
  yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_scout.yaml']
- name: meta-llama/Llama-4-Maverick-17B-128E-Instruct
  yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_maverick_lite.yaml']
- name: nvidia/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-010726
  yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml','super_v3.yaml']
@ -1,9 +1,6 @@
# EXAONE

This document shows how to build and run a [EXAONE](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) model in TensorRT-LLM.

The TensorRT LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
See the LLaMA example [`examples/models/core/llama`](../llama) for details.
This document shows how to build and run [EXAONE](https://huggingface.co/LGAI-EXAONE) models in TensorRT-LLM.

- [EXAONE](#exaone)
  - [Support Matrix](#support-matrix)
@ -11,31 +8,51 @@ See the LLaMA example [`examples/models/core/llama`](../llama) for details.
  - [EXAONE-3.0](#exaone-30)
  - [EXAONE-Deep](#exaone-deep)
  - [EXAONE-4.0](#exaone-40)
  - [Usage](#usage)
  - [PyTorch flow](#pytorch-flow)
    - [PyTorch flow Quantization](#pytorch-flow-quantization)
  - [TRT Flow](#trt-flow)
  - [K-EXAONE](#k-exaone)
- [PyTorch flow](#pytorch-flow)
  - [Running EXAONE-4.0](#running-exaone-40)
  - [Running K-EXAONE](#running-k-exaone)
    - [MoE Backend Options](#moe-backend-options)
  - [PyTorch flow Quantization](#pytorch-flow-quantization)
    - [FP8 Quantization](#fp8-quantization)
    - [NVFP4 Quantization](#nvfp4-quantization)
- [Running the TensorRT LLM Server](#running-the-tensorrt-llm-server)
  - [Running Aggregated TensorRT LLM Server](#running-aggregated-tensorrt-llm-server)
    - [Creating the Extra Options Configuration](#creating-the-extra-options-configuration)
    - [Launch trtllm-serve OpenAI-compatible API server](#launch-trtllm-serve-openai-compatible-api-server)
  - [Running Disaggregated TensorRT LLM Server](#running-disaggregated-tensorrt-llm-server)
    - [Step 1: Set Environment Variables](#step-1-set-environment-variables)
    - [Step 2: Create Configuration Files](#step-2-create-configuration-files)
    - [Step 3: Launch the Disaggregated Server](#step-3-launch-the-disaggregated-server)
- [TRT flow](#trt-flow)
  - [Convert checkpoint and build TensorRT engine(s)](#convert-checkpoint-and-build-tensorrt-engines)
  - [FP8 Post-Training Quantization](#fp8-post-training-quantization)
  - [SmoothQuant](#smoothquant)
  - [Groupwise quantization (AWQ)](#groupwise-quantization-awq)
  - [W4A16 AWQ with FP8 GEMM (W4A8 AWQ)](#w4a16-awq-with-fp8-gemm-w4a8-awq)
    - [W4A16 AWQ with FP8 GEMM (W4A8 AWQ)](#w4a16-awq-with-fp8-gemm-w4a8-awq)
  - [Run Engine](#run-engine)
- [Troubleshootings](#troubleshootings)
  - [Troubleshootings for EXAONE-4.0](#troubleshootings-for-exaone-40)
  - [Troubleshootings for K-EXAONE](#troubleshootings-for-k-exaone)

## Support Matrix
* FP16
* BF16
* Tensor Parallel
* Tensor Parallel (TP)
* Expert Parallel (EP) (K-EXAONE only)
* Attention Data Parallel (ADP) (K-EXAONE only)
* Disaggregated Serving
* FP8
* INT8 & INT4 Weight-Only
* INT8 SmoothQuant
* INT4 AWQ & W4A8 AWQ
* NVFP4 (K-EXAONE only)

## Supported Models

**Note:**
- **EXAONE-3.0** and **EXAONE-Deep** are supported using the [TRT Flow](#trt-flow).
- **EXAONE-4.0** is supported using the [PyTorch flow](#pytorch-flow).
**Note:**
- **EXAONE-3.0** & **EXAONE-Deep** are supported using the [TRT Flow](#trt-flow).
- **EXAONE-4.0** & **K-EXAONE** are supported using the [PyTorch flow](#pytorch-flow).

Please refer to the corresponding sections below for usage instructions and examples for each model.

@ -59,23 +76,33 @@ git clone https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B $HF_MODEL_DIR

### EXAONE-4.0

Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `EXAONE-4.0-32B` model for the example. From EXAONE-4.0 model, we support only on PyTorch flow.
Download the HuggingFace checkpoints of the EXAONE-4.0 model. Here, we use the `EXAONE-4.0-32B` model as an example. EXAONE-4.0 is supported only via the PyTorch flow.

```bash
export HF_MODEL_DIR=hf_models/exaone4
git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B $HF_MODEL_DIR
```

### Pytorch flow
### K-EXAONE

K-EXAONE is a Mixture of Experts (MoE) model based on the EXAONE architecture. It features a hybrid architecture with both dense and MoE layers, sliding window attention, and supports FP8 and NVFP4 quantization for efficient inference.

Download the HuggingFace checkpoints of the K-EXAONE model:

```bash
export HF_MODEL_DIR=hf_models/kexaone
git clone https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B $HF_MODEL_DIR
```

## PyTorch flow

### Running EXAONE-4.0
To quickly run EXAONE-4.0 models, you can use [examples/llm-api/quickstart_advanced.py](../../../llm-api/quickstart_advanced.py):

```bash
python ../../../llm-api/quickstart_advanced.py --model_dir hf_models/$MODEL_NAME --disable_kv_cache_reuse
python ../../../llm-api/quickstart_advanced.py --model_dir $HF_MODEL_DIR
```

SWA currently does not support kv_cache_reuse. Please make sure to disable KV cache reuse when running with SWA.

The output will be like:
```bash
[0] Prompt: 'Hello, my name is', Generated text: " [Your Name], and I'm a [Your Profession]. I'm here to learn and share with you.\n\nBest regards,\n[Your Name]\n\nThis letter is concise, professional, and clearly states who you are and what you're here for. It's a good starting point"
@ -83,47 +110,239 @@ The output will be like:
[2] Prompt: 'The future of AI is', Generated text: ' not just about technology but also about how we choose to use it. We must ensure that AI is developed and deployed in a way that benefits all of humanity, not just a select few. This means prioritizing ethical considerations, transparency, and accountability in AI development. It also means involving diverse stakeholders in the conversation about AI'
```

#### PyTorch flow Quantization
### Running K-EXAONE

For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).

You can either do pre-quantized models in HF model hub, or can generate quantized model by yourself and then run models with below command:
K-EXAONE is a Mixture of Experts model that benefits from multiple parallelism strategies. You can run it with tensor parallelism (TP), expert parallelism (EP), and attention data parallelism (ADP):

```bash
git clone https://github.com/NVIDIA/Model-Optimizer.git
python ../../../llm-api/quickstart_advanced.py \
    --model_dir $HF_MODEL_DIR \
    --tp_size 8 \
    --moe_ep_size 8 \
    --enable_attention_dp \
    --trust_remote_code
```
The output will be like:
```bash
[0] Prompt: 'Hello, my name is', Generated text: ' John Smith, and I am a 28-year-old software developer. I live in the city of San Francisco, California. I work remotely for a tech startup based in Austin, Texas.\n\nI enjoy hiking, reading, and playing the piano. In my free time, I often explore new neighborhoods in San Francisco, trying out new restaurants and cafes.\n\n'
[1] Prompt: 'The capital of France is', Generated text: ' Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris'
[2] Prompt: 'The future of AI is', Generated text: ' bright.\n</think>\n\nThe future of AI holds immense promise across numerous domains. In healthcare, AI is revolutionizing diagnostics, drug discovery, and personalized treatment plans. In education, AI is enabling adaptive learning platforms that cater to individual learning styles and paces. In environmental science, AI is playing a pivotal role in addressing climate change by optimizing'
```

#### MoE Backend Options

K-EXAONE supports the following MoE backends:

| Backend | Description |
|---------|-------------|
| `CUTLASS` | Default backend, optimized for general use cases |
| `TRTLLM` | TensorRT-LLM backend using TRT-LLM Gen kernels, optimized for low-latency inference |
| `WIDEEP` | Wide expert parallelism backend for cases where EP size exceeds the number of experts |

You can specify the MoE backend using the `--moe_backend` argument:

```bash
python ../../../llm-api/quickstart_advanced.py \
    --model_dir $HF_MODEL_DIR \
    --tp_size 8 \
    --moe_ep_size 8 \
    --enable_attention_dp \
    --moe_backend CUTLASS \
    --trust_remote_code
```

### PyTorch flow Quantization

For PyTorch flow, TRT-LLM supports quantized formats generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). You can either use pre-quantized models from the HuggingFace model hub, or generate quantized models yourself using the instructions below.

First, clone the [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) repository:

```bash
git clone https://github.com/NVIDIA/Model-Optimizer
cd Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf
```

For more information, please refer to official [docs](https://github.com/NVIDIA/Model-Optimizer) or [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
For more information, please refer to the official [Model Optimizer documentation](https://github.com/NVIDIA/Model-Optimizer).

Troubleshooting
#### FP8 Quantization

FP8 quantization provides a good balance between model accuracy and inference performance. To quantize a model to FP8 format:

The following error may occur during quantization:
```bash
torch._dynamo.exc.Unsupported: Graph break under GenericContextWrappingVariable
Explanation: Attempted to graph break in an active context manager(s) that doesn't support graph breaking.
Hint: Move the offending context manager(s) to outside the compiled region.
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
python3 hf_ptq.py --model $HF_MODEL_DIR --quant fp8 --export_fmt hf
```

This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
#### NVFP4 Quantization

Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:
```json
# generation_config.json
{
// Change "hybrid" to "dynamic" to run PTQ.
// Revert this to "hybrid" after quantization is complete.
"cache_implementation": "hybrid",
...
}
NVFP4 (4-bit floating point) quantization enables memory-efficient inference with reduced GPU memory footprint. To quantize a model to NVFP4 format:

```bash
python3 hf_ptq.py --model $HF_MODEL_DIR --quant nvfp4 --export_fmt hf
```
For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings.

### TRT flow
## Running the TensorRT LLM Server

The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`.
This section describes how to deploy the K-EXAONE model using the TensorRT LLM server with an OpenAI-compatible API endpoint.
Make sure `HF_MODEL_DIR` points to your EXAONE checkpoint directory.

The examples in this section are intended as a minimal, runnable demonstration and are not fully performance-optimized. For more features and performance tuning, please refer to the documents below.
- [Disaggregated Serving examples](../../../disaggregated/README.md)
- [Disaggregated Serving feature guide](../../../../docs/source/features/disagg-serving.md)
- [Recommended LLM API configuration settings](../../../configs/README.md) (see also `examples/configs/curated/`)

### Running Aggregated TensorRT LLM Server

The aggregated server runs all components (context and generation phases) on the same set of GPUs, which is suitable for single-node deployments.

#### Creating the Extra Options Configuration

Create a YAML configuration file to specify advanced options such as attention data parallelism, CUDA graph settings, and MoE backend configuration:

```bash
cat <<EOF > configs.yaml
enable_attention_dp: true
trust_remote_code: true
cuda_graph_config:
  max_batch_size: 2048
  enable_padding: true
moe_config:
  backend: CUTLASS # The TRTLLM backend is recommended for the Blackwell architecture.
kv_cache_config:
  enable_block_reuse: true # Please disable the block reuse feature when conducting performance benchmarking.
  max_attention_window: [128, 128, 128, 131072] # This allows the KV cache manager to possibly improve memory efficiency.
  free_gpu_memory_fraction: 0.9
  dtype: "auto"
attention_dp_config:
  enable_balance: true
  batching_wait_iters: 50
  timeout_iters: 1
num_postprocess_workers: 4 # Can mitigate the postprocessing overhead (e.g. detokenization)
EOF
```

#### Launch trtllm-serve OpenAI-compatible API server

Start the server using `trtllm-serve` with the PyTorch backend. This launches an OpenAI-compatible API server that can handle chat completions and text generation requests:

```bash
trtllm-serve \
    $HF_MODEL_DIR \
    --host localhost \
    --port 8000 \
    --backend pytorch \
    --max_batch_size 2048 \
    --max_num_tokens 8192 \
    --tp_size 8 \
    --ep_size 8 \
    --pp_size 1 \
    --config ./configs.yaml
```

Once the server is running, you can send requests to `http://localhost:8000/v1/completions` using the OpenAI API format.
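As a quick smoke test, you can query the endpoint with the official OpenAI Python client. This is a minimal sketch; the `model` value below is an assumption (trtllm-serve typically registers the served checkpoint under its path or name, so check `GET /v1/models` for the exact identifier):

```python
from openai import OpenAI

# The local server does not require authentication; any api_key placeholder works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

response = client.completions.create(
    model="K-EXAONE-236B-A23B",  # hypothetical name; verify via client.models.list()
    prompt="The capital of France is",
    max_tokens=32,
)
print(response.choices[0].text)
```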
### Running Disaggregated TensorRT LLM Server

Disaggregated serving separates the context (prefill) and generation (decode) phases onto different GPU sets, enabling better resource utilization and improved throughput. This example demonstrates a single-node disaggregated deployment using 8 GPUs (4 for context, 4 for generation). For more details, see the [Disaggregated Serving documentation](../../../disaggregated/README.md).

#### Step 1: Set Environment Variables

Configure the parallelism and buffer settings:

```bash
# Buffer size for KV cache transfer between context and generation servers
export MAX_TOKENS_IN_BUFFER=8192

# Model parallelism configuration
export TP_SIZE=4
export MOE_EP_SIZE=4
export ENABLE_ATTENTION_DP=true
```

#### Step 2: Create Configuration Files

**Context server configuration (`ctx_extra-llm-api-config.yaml`):**

```bash
cat > ctx_extra-llm-api-config.yaml << EOF
backend: pytorch
trust_remote_code: true
disable_overlap_scheduler: true
enable_chunked_prefill: true

tensor_parallel_size: $TP_SIZE
moe_expert_parallel_size: $MOE_EP_SIZE
pipeline_parallel_size: 1
enable_attention_dp: $ENABLE_ATTENTION_DP

cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: $MAX_TOKENS_IN_BUFFER
EOF
```

**Generation server configuration (`gen_extra-llm-api-config.yaml`):**

```bash
cat > gen_extra-llm-api-config.yaml << EOF
backend: pytorch
trust_remote_code: true
disable_overlap_scheduler: false
enable_chunked_prefill: true

tensor_parallel_size: $TP_SIZE
moe_expert_parallel_size: $MOE_EP_SIZE
pipeline_parallel_size: 1
enable_attention_dp: $ENABLE_ATTENTION_DP

cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: $MAX_TOKENS_IN_BUFFER
EOF
```

**Disaggregated orchestrator configuration (`disagg_config.yaml`):**

```bash
cat > disagg_config.yaml << EOF
hostname: localhost
port: 8000
backend: pytorch
context_servers:
  num_instances: 1
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  urls:
    - "localhost:8002"
EOF
```

#### Step 3: Launch the Disaggregated Server

Start all components in the following order:

```bash
# 1. Start context server (GPUs 0-3)
CUDA_VISIBLE_DEVICES=0,1,2,3 trtllm-serve $HF_MODEL_DIR \
    --host localhost --port 8001 --enable_chunked_prefill \
    --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx.log &

# 2. Start generation server (GPUs 4-7)
CUDA_VISIBLE_DEVICES=4,5,6,7 trtllm-serve $HF_MODEL_DIR \
    --host localhost --port 8002 --enable_chunked_prefill \
    --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen.log &

# 3. Start disaggregated orchestrator
trtllm-serve disaggregated -c disagg_config.yaml -t 360 -r 1200 &> log_disagg.log &
```

Once all servers are running, you can send requests to `http://localhost:8000/v1/completions` using the OpenAI API format.
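Because the three servers start asynchronously in the background, it can help to poll their health endpoints before sending traffic. A small sketch, assuming the standard `/health` route exposed by trtllm-serve (if your version uses a different route, adjust accordingly):

```python
import time
import urllib.request

# Orchestrator plus the two workers launched above.
servers = [
    "http://localhost:8000/health",
    "http://localhost:8001/health",
    "http://localhost:8002/health",
]

for url in servers:
    # Retry until each server answers; model loading can take several minutes.
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    break
        except OSError:
            pass
        time.sleep(10)
print("all servers ready")
```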
## TRT flow
|
||||
|
||||
The next section describes how to convert weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT LLM format. We will use LLaMA's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE models and then build the model with `trtllm-build`.
|
||||
|
||||
### Convert checkpoint and build TensorRT engine(s)
|
||||
|
||||
@ -141,7 +360,7 @@ trtllm-build \
|
||||
--output_dir trt_engines/exaone/fp16/1-gpu \
|
||||
--gemm_plugin auto
|
||||
|
||||
# Build the EXAONE model using a single GPU and and apply INT8 weight-only quantization.
|
||||
# Build the EXAONE model using a single GPU and apply INT8 weight-only quantization.
|
||||
python ../llama/convert_checkpoint.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--output_dir trt_models/exaone/int8_wq/1-gpu \
|
||||
@ -154,7 +373,7 @@ trtllm-build \
|
||||
--output_dir trt_engines/exaone/int8_wq/1-gpu \
|
||||
--gemm_plugin auto
|
||||
|
||||
# Build the EXAONE model using a single GPU and and apply INT4 weight-only quantization.
|
||||
# Build the EXAONE model using a single GPU and apply INT4 weight-only quantization.
|
||||
python ../llama/convert_checkpoint.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--output_dir trt_models/exaone/int4_wq/1-gpu \
|
||||
@ -183,18 +402,18 @@ trtllm-build \
|
||||
|
||||
### FP8 Post-Training Quantization
|
||||
|
||||
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
|
||||
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
|
||||
|
||||
```bash
|
||||
# Build the EXAONE model using a single GPU and and apply FP8 quantization.
|
||||
# Build the EXAONE model using a single GPU and apply FP8 quantization.
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--dtype float16 \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--output_dir trt_models/exaone/fp8/1-gpu \
|
||||
--output_dir trt_models/exaone/fp8/1-gpu
|
||||
|
||||
trtllm-build \
|
||||
--checkpoint_dir trt_models/exaone/fp8/1-gpu \
|
||||
@ -204,12 +423,12 @@ trtllm-build \
|
||||
|
||||
### SmoothQuant
|
||||
|
||||
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
|
||||
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
|
||||
|
||||
```bash
|
||||
# Build the EXAONE model using a single GPU and and apply INT8 SmoothQuant.
|
||||
# Build the EXAONE model using a single GPU and apply INT8 SmoothQuant.
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--dtype float16 \
|
||||
@ -224,12 +443,12 @@ trtllm-build \
|
||||
|
||||
### Groupwise quantization (AWQ)
|
||||
|
||||
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
|
||||
|
||||
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
|
||||
|
||||
```bash
|
||||
# Build the EXAONE model using a single GPU and and apply INT4 AWQ.
|
||||
# Build the EXAONE model using a single GPU and apply INT4 AWQ.
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--dtype float16 \
|
||||
@ -248,7 +467,7 @@ For Hopper GPUs, TRT-LLM also supports employing FP8 GEMM for accelerating linea
|
||||
Please make sure your system contains a Hopper GPU before trying the commands below.
|
||||
|
||||
```bash
|
||||
# Build the EXAONE model using a single GPU and and apply W4A8 AWQ.
|
||||
# Build the EXAONE model using a single GPU and apply W4A8 AWQ.
|
||||
python ../../../quantization/quantize.py \
|
||||
--model_dir $HF_MODEL_DIR \
|
||||
--dtype float16 \
|
||||
@ -287,4 +506,50 @@ python ../../../summarize.py \
    --engine_dir trt_engines/exaone/fp16/1-gpu
```

For more examples see [`examples/models/core/llama/README.md`](../llama/README.md)
For more examples regarding EXAONE-3.0 & EXAONE-Deep's TRT flow, see [`examples/models/core/llama/README.md`](../llama/README.md)


## Troubleshooting

### Troubleshooting for EXAONE-4.0

The following error may occur during quantization:

```bash
torch._dynamo.exc.Unsupported: Graph break under GenericContextWrappingVariable
Explanation: Attempted to graph break in an active context manager(s) that doesn't support graph breaking.
Hint: Move the offending context manager(s) to outside the compiled region.
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
```

This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.

Temporarily switching to `DynamicCache` when creating PTQ models can help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:

```json
// generation_config.json
{
    // Change "hybrid" to "dynamic" to run PTQ.
    // Revert this to "hybrid" after quantization is complete.
    "cache_implementation": "hybrid",
    ...
}
```

For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings.
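
If you prefer not to hand-edit the file, the same toggle can be scripted. This is only a sketch of the edit described above; the checkpoint path is hypothetical and should point at your model directory:

```python
import json
from pathlib import Path

# Hypothetical checkpoint location; adjust to your model directory.
config_path = Path("hf_models/exaone-4.0") / "generation_config.json"

config = json.loads(config_path.read_text())
# Use DynamicCache for PTQ; set this back to "hybrid" once quantization finishes.
config["cache_implementation"] = "dynamic"
config_path.write_text(json.dumps(config, indent=2))
```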

### Troubleshooting for K-EXAONE

K-EXAONE is a Mixture of Experts (MoE) model that activates 8 experts per token. When too few tokens are provided during PTQ, some experts in some layers may never be activated and will not produce proper weights.

To address this issue, provide enough data samples during calibration by increasing the `calib_size` and `calib_seq` parameters (a rough token-count estimate follows the commands below):

**FP8 Quantization:**
```bash
cd Model-Optimizer/examples/llm_ptq
python3 hf_ptq.py --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf --calib_size 8192 --calib_seq 1024
```

**NVFP4 Quantization:**
```bash
cd Model-Optimizer/examples/llm_ptq
python3 hf_ptq.py --model hf_models/$MODEL_NAME --quant nvfp4 --export_fmt hf --calib_size 8192 --calib_seq 1024
```
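
As a rough sanity check, you can estimate how much calibration signal these settings provide. The arithmetic below simply multiplies the flags used above; it does not guarantee that every expert is activated:

```python
# Back-of-the-envelope coverage estimate for the calibration flags above.
calib_size = 8192        # calibration samples
calib_seq = 1024         # tokens per sample
experts_per_token = 8    # K-EXAONE activates 8 experts per token

total_tokens = calib_size * calib_seq                  # 8,388,608 (~8.4M) tokens
expert_activations = total_tokens * experts_per_token  # ~67.1M expert activations
print(f"{total_tokens:,} tokens, {expert_activations:,} expert activations")
```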

@ -10,7 +10,7 @@ tiktoken
einops

# optional dependencies
gradio==4.44.1
gradio==5.4.0
mdtex2html
sse_starlette
aiohttp_sse_client

@ -1155,7 +1155,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
export pytestCommand="$pytestCommand"
export coverageConfigFile="$coverageConfigFile"
export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0}
export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))}
export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=csv,noheader)-1)))}
${envExportStatements}

echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS"
@ -3249,10 +3249,12 @@ def launchTestJobs(pipeline, testFilter)
    fullSet = parallelJobs.keySet()

    x86SlurmTestConfigs = [
        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 2, 2],
        "DGX_H100-2_GPUs-PyTorch-Others-2": ["dgx-h100-x2-oci", "l0_dgx_h100", 2, 2, 2],
        "DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 2, 4],
        "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4-oci", "l0_dgx_h100", 2, 2, 4],
        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],

111  jenkins/runPerfSanityTriage.groovy  Normal file
@ -0,0 +1,111 @@
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.InterruptedException

DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202510291120-8621"

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"

def createKubernetesPodConfig(image, arch = "amd64")
{
    def archSuffix = arch == "arm64" ? "arm" : "amd"
    def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"

    def podConfig = [
        cloud: "kubernetes-cpu",
        namespace: "sw-tensorrt",
        yaml: """
            apiVersion: v1
            kind: Pod
            spec:
                nodeSelector:
                    nvidia.com/node_type: builder
                    kubernetes.io/os: linux
                containers:
                  - name: trt-llm
                    image: ${image}
                    command: ['cat']
                    volumeMounts:
                      - name: sw-tensorrt-pvc
                        mountPath: "/mnt/sw-tensorrt-pvc"
                        readOnly: false
                    tty: true
                    resources:
                        requests:
                            cpu: 2
                            memory: 5Gi
                            ephemeral-storage: 25Gi
                        limits:
                            cpu: 2
                            memory: 5Gi
                            ephemeral-storage: 25Gi
                    imagePullPolicy: Always
                  - name: jnlp
                    image: ${jnlpImage}
                    args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                    resources:
                        requests:
                            cpu: '2'
                            memory: 5Gi
                            ephemeral-storage: 25Gi
                        limits:
                            cpu: '2'
                            memory: 5Gi
                            ephemeral-storage: 25Gi
                qosClass: Guaranteed
                volumes:
                  - name: sw-tensorrt-pvc
                    persistentVolumeClaim:
                        claimName: sw-tensorrt-pvc
        """.stripIndent(),
    ]

    return podConfig
}

pipeline {
    agent {
        kubernetes createKubernetesPodConfig(DOCKER_IMAGE)
    }
    options {
        timestamps()
    }
    environment {
        OPEN_SEARCH_DB_BASE_URL=credentials("open_search_db_base_url")
        OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
    }
    parameters {
        string(name: "BRANCH", defaultValue: "main", description: "Branch to checkout.")
        string(name: "OPEN_SEARCH_PROJECT_NAME", defaultValue: "swdl-trtllm-infra-ci-prod-perf_sanity_info", description: "OpenSearch project name.")
        string(name: "OPERATION", defaultValue: "SLACK BOT SENDS MESSAGE", description: "Operation to perform.")
        string(name: "QUERY_JOB_NUMBER", defaultValue: "1", description: "Number of latest jobs to query.")
        string(name: "SLACK_CHANNEL_ID", defaultValue: "C0A7D0LCA1F", description: "Slack channel IDs to send messages to.")
        string(name: "SLACK_BOT_TOKEN", defaultValue: "", description: "Slack bot token for authentication.")
    }
    stages {
        stage("Run Perf Sanity Script") {
            steps {
                container("trt-llm") {
                    script {
                        sh "pwd && ls -alh"
                        sh "env | sort"
                        trtllm_utils.checkoutSource(LLM_REPO, params.BRANCH, LLM_ROOT, false, false)
                        sh "pip install slack_sdk"
                        sh """
                            cd ${LLM_ROOT}/jenkins/scripts/perf && ls -alh && python3 perf_sanity_triage.py \
                                --project_name "${params.OPEN_SEARCH_PROJECT_NAME}" \
                                --operation "${params.OPERATION}" \
                                --channel_id "${params.SLACK_CHANNEL_ID}" \
                                --bot_token "${params.SLACK_BOT_TOKEN}" \
                                --query_job_number "${params.QUERY_JOB_NUMBER}"
                        """
                    }
                }
            }
        } // stage Run Perf Sanity Script
    } // stages
} // pipeline

251  jenkins/scripts/perf/perf_sanity_triage.py  Normal file
@ -0,0 +1,251 @@
#!/usr/bin/env python3

import argparse
import json
import sys
import time

from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

sys.path.insert(0, sys.path[0] + "/..")
from open_search_db import OpenSearchDB

QUERY_LOOKBACK_DAYS = 90
MAX_QUERY_SIZE = 3000
MAX_TEST_CASES_PER_MSG = 5
POST_SLACK_MSG_RETRY_TIMES = 5


def query_regression_data(project_name):
    """Query regression data from OpenSearch database."""
    last_days = QUERY_LOOKBACK_DAYS

    must_clauses = [
        {"term": {"b_is_valid": True}},
        {"term": {"b_is_post_merge": True}},
        {"term": {"b_is_regression": True}},
        {"term": {"b_is_baseline": False}},
        {
            "range": {
                "ts_created": {
                    # Lower bound: (now - last_days), floored to the start of the
                    # UTC day, expressed in epoch milliseconds.
                    "gte": int(time.time() - 24 * 3600 * last_days)
                    // (24 * 3600)
                    * 24
                    * 3600
                    * 1000,
                }
            }
        },
    ]

    json_data = {
        "query": {
            "bool": {"must": must_clauses},
        },
        "size": MAX_QUERY_SIZE,
    }
    json_data = json.dumps(json_data)

    data_list = []
    try:
        res = OpenSearchDB.queryFromOpenSearchDB(json_data, project_name)
        if res is None:
            print(f"Failed to query from {project_name}, returned no response")
            return None
        payload = res.json().get("hits", {}).get("hits", [])
        if len(payload) == 0:
            print(f"No regression data found in {project_name}, returned empty list")
            return []
        for hit in payload:
            data_dict = hit.get("_source", {})
            data_dict["_id"] = hit.get("_id", "")
            if data_dict["_id"] == "":
                print(f"Failed to query from {project_name}, returned data with no _id")
                return None
            data_list.append(data_dict)
        print(f"Successfully queried from {project_name}, queried {len(data_list)} entries")
        return data_list
    except Exception as e:
        print(f"Failed to query from {project_name}, returned error: {e}")
        return None


def get_regression_data_by_job_id(data_list, query_job_number):
    """Returns a dict with job_id as key and list of regression data as value.

    Only returns the latest query_job_number jobs.
    """
    if data_list is None or len(data_list) == 0:
        return {}

    # Group data by job_id
    job_data_dict = {}
    for data in data_list:
        job_id = data.get("s_job_id", "")
        if job_id == "":
            continue
        if job_id not in job_data_dict:
            job_data_dict[job_id] = []
        job_data_dict[job_id].append(data)

    # Sort job_ids by the latest ts_created in each group (descending)
    def get_latest_timestamp(job_id):
        timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]]
        return max(timestamps) if timestamps else 0

    sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True)

    # Only keep the latest query_job_number jobs
    latest_job_ids = sorted_job_ids[:query_job_number]

    result = {}
    for job_id in latest_job_ids:
        result[job_id] = job_data_dict[job_id]

    return result


def process_regression_message(regression_dict):
    """Process regression data into message chunks.

    Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases.
    """
    if not regression_dict:
        return []

    # Flatten all test cases into a list with (job_id, idx, data) tuples
    all_test_cases = []
    for job_id, data_list in regression_dict.items():
        sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", ""))
        for idx, data in enumerate(sorted_data_list, start=1):
            all_test_cases.append((job_id, idx, data))

    # Split into chunks of MAX_TEST_CASES_PER_MSG
    chunks = []
    for i in range(0, len(all_test_cases), MAX_TEST_CASES_PER_MSG):
        chunks.append(all_test_cases[i : i + MAX_TEST_CASES_PER_MSG])

    # Build messages for each chunk
    messages = []
    for chunk in chunks:
        msg_parts = []
        current_job_id = None
        for job_id, idx, data in chunk:
            # Add job header when switching to a new job_id
            if job_id != current_job_id:
                if msg_parts:
                    msg_parts.append("\n")
                job_header = f"*LLM/main/L0_PostMerge/{job_id}:*\n"
                msg_parts.append(job_header)
                current_job_id = job_id

            test_case_name = data.get("s_test_case_name", "N/A")
            regression_info = data.get("s_regression_info", "N/A")
            msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n")
            for part in regression_info.split(","):
                part = part.strip()
                if part and "baseline_id" not in part:
                    msg_parts.append(f"  {part}\n")

        msg = "".join(msg_parts).strip()
        messages.append(msg)

    return messages
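
# Illustrative only (not part of the original script): with MAX_TEST_CASES_PER_MSG = 5,
# seven regressions under a single job id are split into a 5-case and a 2-case message,
# and any "baseline_id" fragment of s_regression_info is filtered out of the body:
#
#   process_regression_message({
#       "1234": [
#           {"s_test_case_name": f"case_{i}",
#            "s_regression_info": "throughput: -12%, baseline_id: abc"}
#           for i in range(7)
#       ]
#   })  # -> list of 2 messages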


def send_regression_message(messages, channel_id, bot_token):
    """Send regression messages to Slack channel(s).

    channel_id can be a single ID or multiple IDs separated by commas.
    """
    if not messages:
        print("No regression data to send")
        return

    if channel_id and bot_token:
        channel_ids = [cid.strip() for cid in channel_id.split(",") if cid.strip()]
        for cid in channel_ids:
            for msg in messages:
                send_message(msg, cid, bot_token)
    else:
        print("Slack channel_id or bot_token not provided, printing message:")
        for i, msg in enumerate(messages, start=1):
            print(f"--- Message {i} ---")
            print(msg)


def send_message(msg, channel_id, bot_token):
    """Send message to Slack channel using slack_sdk."""
    client = WebClient(token=bot_token)

    attachments = [
        {
            "title": "Perf Sanity Regression Report",
            "color": "#ff0000",
            "text": msg,
        }
    ]

    for attempt in range(1, POST_SLACK_MSG_RETRY_TIMES + 1):
        try:
            result = client.chat_postMessage(
                channel=channel_id,
                attachments=attachments,
            )
            assert result["ok"] is True, json.dumps(result.data)
            print(f"Message sent successfully to channel {channel_id}")
            return
        except SlackApiError as e:
            print(
                f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Error sending message to Slack: {e}"
            )
        except Exception as e:
            print(f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Unexpected error: {e}")

        if attempt < POST_SLACK_MSG_RETRY_TIMES:
            time.sleep(1)

    print(
        f"Failed to send message to channel {channel_id} after {POST_SLACK_MSG_RETRY_TIMES} attempts"
    )


def main():
    parser = argparse.ArgumentParser(description="Perf Sanity Triage Script")
    parser.add_argument("--project_name", type=str, required=True, help="OpenSearch project name")
    parser.add_argument("--operation", type=str, required=True, help="Operation to perform")
    parser.add_argument(
        "--channel_id",
        type=str,
        default="",
        help="Slack channel ID(s), comma-separated for multiple channels",
    )
    parser.add_argument("--bot_token", type=str, default="", help="Slack bot token")
    parser.add_argument(
        "--query_job_number", type=int, default=1, help="Number of latest jobs to query"
    )

    args = parser.parse_args()

    print(f"Project Name: {args.project_name}")
    print(f"Operation: {args.operation}")
    print(f"Channel ID: {args.channel_id}")
    print(f"Bot Token: {'***' if args.bot_token else 'Not provided'}")
    print(f"Query Job Number: {args.query_job_number}")

    if args.operation == "SLACK BOT SENDS MESSAGE":
        data_list = query_regression_data(args.project_name)
        if data_list is None:
            print("Failed to query regression data")
            return

        regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
        messages = process_regression_message(regression_dict)
        send_regression_message(messages, args.channel_id, args.bot_token)
    else:
        print(f"Unknown operation: {args.operation}")


if __name__ == "__main__":
    main()
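
# Illustrative local dry-run (assumed invocation, mirroring the Jenkins stage above):
#   python3 perf_sanity_triage.py \
#       --project_name swdl-trtllm-infra-ci-prod-perf_sanity_info \
#       --operation "SLACK BOT SENDS MESSAGE" \
#       --query_job_number 3
# When --channel_id/--bot_token are omitted, send_regression_message() prints the
# report instead of posting to Slack, which is handy for checking the query locally.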

@ -40,6 +40,14 @@ TEST_LIST_PATH = (
    REPO_ROOT / "tests" / "integration" / "test_lists" / "qa" / "llm_config_database.yml"
)
ITERATIONS = 10
# Mapping from HuggingFace model IDs to MODEL_PATH_DICT keys used by the test framework
# in tests/integration/defs/perf/test_perf_sanity.py
MODEL_NAME_MAPPING = {
    "deepseek-ai/DeepSeek-R1-0528": "deepseek_r1_0528_fp8",
    "nvidia/DeepSeek-R1-0528-FP4-v2": "deepseek_r1_0528_fp4_v2",
    "openai/gpt-oss-120b": "gpt_oss_120b_fp4",
}


# GPU type to condition wildcards mapping for test list
# Note: cpu is used to distinguish between e.g. H200_SXM and GH200
@ -65,9 +73,13 @@ def generate_client_name(recipe: Recipe) -> str:

def recipe_to_server_config(recipe: Recipe, llm_api_config: dict) -> dict:
    """Convert a recipe + LLM API config to aggr_server format."""
    model_name = MODEL_NAME_MAPPING.get(recipe.model)
    if not model_name:
        raise ValueError(f"Model not found in MODEL_NAME_MAPPING: {recipe.model}")

    server_config = {
        "name": generate_server_name(recipe),
        "model_name": recipe.model,
        "model_name": model_name,
        "gpus": recipe.num_gpus,
        # Enable scenario-only matching for baseline comparison
        "match_mode": "scenario",
@ -157,7 +169,7 @@ def generate_condition_entry(
    }

    tests = [
        f"perf/test_perf.py::test_perf[perf_sanity_upload-{config_name}-{name}]"
        f"perf/test_perf_sanity.py::test_e2e[aggr_upload-{config_name}-{name}]"
        for name in server_names
    ]
    return {"condition": condition, "tests": tests}

91  security_scanning/cpp/kernels/fmha_v2/poetry.lock  generated
@ -150,53 +150,58 @@ testing = ["filelock"]

[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
    (generated lock data: the tomli 2.3.0 wheel/sdist hashes are replaced by the
    corresponding 2.4.0 hashes, which also add win_arm64 wheels for cp311-cp314t)
]

[[package]]

91  security_scanning/docs/poetry.lock  generated
@ -1119,53 +1119,58 @@ test = ["pytest"]

[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
    (same tomli 2.3.0 -> 2.4.0 hash update as in
    security_scanning/cpp/kernels/fmha_v2/poetry.lock above)
]

[[package]]
@ -2396,80 +2396,80 @@ test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis
|
||||
|
||||
[[package]]
|
||||
name = "scipy"
|
||||
version = "1.16.3"
|
||||
version = "1.17.0"
|
||||
description = "Fundamental algorithms for scientific computing in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.11"
|
||||
files = [
|
||||
{file = "scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78"},
|
||||
{file = "scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc"},
|
||||
{file = "scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304"},
|
||||
{file = "scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc"},
|
||||
{file = "scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a"},
|
||||
{file = "scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6"},
|
||||
{file = "scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657"},
|
||||
{file = "scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26"},
    {file = "scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc"},
    {file = "scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22"},
    {file = "scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc"},
    {file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0"},
    {file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800"},
    {file = "scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d"},
    {file = "scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f"},
    {file = "scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c"},
    {file = "scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40"},
    {file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d"},
    {file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa"},
    {file = "scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8"},
    {file = "scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353"},
    {file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146"},
    {file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d"},
    {file = "scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7"},
    {file = "scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562"},
    {file = "scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb"},
    {file = "scipy-1.17.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd"},
    {file = "scipy-1.17.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558"},
    {file = "scipy-1.17.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:272a9f16d6bb4667e8b50d25d71eddcc2158a214df1b566319298de0939d2ab7"},
    {file = "scipy-1.17.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7204fddcbec2fe6598f1c5fdf027e9f259106d05202a959a9f1aecf036adc9f6"},
    {file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fc02c37a5639ee67d8fb646ffded6d793c06c5622d36b35cfa8fe5ececb8f042"},
    {file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dac97a27520d66c12a34fd90a4fe65f43766c18c0d6e1c0a80f114d2260080e4"},
    {file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb7446a39b3ae0fe8f416a9a3fdc6fba3f11c634f680f16a239c5187bc487c0"},
    {file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:474da16199f6af66601a01546144922ce402cb17362e07d82f5a6cf8f963e449"},
    {file = "scipy-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:255c0da161bd7b32a6c898e7891509e8a9289f0b1c6c7d96142ee0d2b114c2ea"},
    {file = "scipy-1.17.0-cp311-cp311-win_arm64.whl", hash = "sha256:85b0ac3ad17fa3be50abd7e69d583d98792d7edc08367e01445a1e2076005379"},
    {file = "scipy-1.17.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57"},
    {file = "scipy-1.17.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e"},
    {file = "scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8"},
    {file = "scipy-1.17.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306"},
    {file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742"},
    {file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b"},
    {file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d"},
    {file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e"},
    {file = "scipy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8"},
    {file = "scipy-1.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b"},
    {file = "scipy-1.17.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6"},
    {file = "scipy-1.17.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269"},
    {file = "scipy-1.17.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72"},
    {file = "scipy-1.17.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61"},
    {file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6"},
    {file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752"},
    {file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d"},
    {file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea"},
    {file = "scipy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812"},
    {file = "scipy-1.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2"},
    {file = "scipy-1.17.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3"},
    {file = "scipy-1.17.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97"},
    {file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e"},
    {file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07"},
    {file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00"},
    {file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45"},
    {file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209"},
    {file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04"},
    {file = "scipy-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0"},
    {file = "scipy-1.17.0-cp313-cp313t-win_arm64.whl", hash = "sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67"},
    {file = "scipy-1.17.0-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a"},
    {file = "scipy-1.17.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2"},
    {file = "scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467"},
    {file = "scipy-1.17.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e"},
    {file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67"},
    {file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73"},
    {file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b"},
    {file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b"},
    {file = "scipy-1.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061"},
    {file = "scipy-1.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb"},
    {file = "scipy-1.17.0-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1"},
    {file = "scipy-1.17.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1"},
    {file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232"},
    {file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d"},
    {file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba"},
    {file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db"},
    {file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf"},
    {file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f"},
    {file = "scipy-1.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088"},
    {file = "scipy-1.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff"},
    {file = "scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e"},
]

[package.dependencies]
numpy = ">=1.25.2,<2.6"
numpy = ">=1.26.4,<2.7"

[package.extras]
dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"]
dev = ["click (<8.3.0)", "cython-lint (>=0.12.2)", "mypy (==1.10.0)", "pycodestyle", "ruff (>=0.12.0)", "spin", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)", "tabulate"]
test = ["Cython", "array-api-strict (>=2.3.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest (>=8.0.0)", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]

[[package]]

@ -42,6 +42,40 @@ files = [
    {file = "av-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:273a3e32de64819e4a1cd96341824299fe06f70c46f2288b5dc4173944f0fd62"},
    {file = "av-16.1.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:640f57b93f927fba8689f6966c956737ee95388a91bd0b8c8b5e0481f73513d6"},
    {file = "av-16.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ae3fb658eec00852ebd7412fdc141f17f3ddce8afee2d2e1cf366263ad2a3b35"},
    {file = "av-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ee558d9c02a142eebcbe55578a6d817fedfde42ff5676275504e16d07a7f86"},
    {file = "av-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7ae547f6d5fa31763f73900d43901e8c5fa6367bb9a9840978d57b5a7ae14ed2"},
    {file = "av-16.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8cf065f9d438e1921dc31fc7aa045790b58aee71736897866420d80b5450f62a"},
    {file = "av-16.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a345877a9d3cc0f08e2bc4ec163ee83176864b92587afb9d08dff50f37a9a829"},
    {file = "av-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f49243b1d27c91cd8c66fdba90a674e344eb8eb917264f36117bf2b6879118fd"},
    {file = "av-16.1.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:ce2a1b3d8bf619f6c47a9f28cfa7518ff75ddd516c234a4ee351037b05e6a587"},
    {file = "av-16.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:408dbe6a2573ca58a855eb8cd854112b33ea598651902c36709f5f84c991ed8e"},
    {file = "av-16.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:57f657f86652a160a8a01887aaab82282f9e629abf94c780bbdbb01595d6f0f7"},
    {file = "av-16.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:adbad2b355c2ee4552cac59762809d791bda90586d134a33c6f13727fb86cb3a"},
    {file = "av-16.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f42e1a68ec2aebd21f7eb6895be69efa6aa27eec1670536876399725bbda4b99"},
    {file = "av-16.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58fe47aeaef0f100c40ec8a5de9abbd37f118d3ca03829a1009cf288e9aef67c"},
    {file = "av-16.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:565093ebc93b2f4b76782589564869dadfa83af5b852edebedd8fee746457d06"},
    {file = "av-16.1.0-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:574081a24edb98343fd9f473e21ae155bf61443d4ec9d7708987fa597d6b04b2"},
    {file = "av-16.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:9ab00ea29c25ebf2ea1d1e928d7babb3532d562481c5d96c0829212b70756ad0"},
    {file = "av-16.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a84a91188c1071f238a9523fd42dbe567fb2e2607b22b779851b2ce0eac1b560"},
    {file = "av-16.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c2cd0de4dd022a7225ff224fde8e7971496d700be41c50adaaa26c07bb50bf97"},
    {file = "av-16.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0816143530624a5a93bc5494f8c6eeaf77549b9366709c2ac8566c1e9bff6df5"},
    {file = "av-16.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e3a28053af29644696d0c007e897d19b1197585834660a54773e12a40b16974c"},
    {file = "av-16.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e3e67144a202b95ed299d165232533989390a9ea3119d37eccec697dc6dbb0c"},
    {file = "av-16.1.0-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:39a634d8e5a87e78ea80772774bfd20c0721f0d633837ff185f36c9d14ffede4"},
    {file = "av-16.1.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0ba32fb9e9300948a7fa9f8a3fc686e6f7f77599a665c71eb2118fdfd2c743f9"},
    {file = "av-16.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:ca04d17815182d34ce3edc53cbda78a4f36e956c0fd73e3bab249872a831c4d7"},
    {file = "av-16.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ee0e8de2e124a9ef53c955fe2add6ee7c56cc8fd83318265549e44057db77142"},
    {file = "av-16.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22bf77a2f658827043a1e184b479c3bf25c4c43ab32353677df2d119f080e28f"},
    {file = "av-16.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2dd419d262e6a71cab206d80bbf28e0a10d0f227b671cdf5e854c028faa2d043"},
    {file = "av-16.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:53585986fd431cd436f290fba662cfb44d9494fbc2949a183de00acc5b33fa88"},
    {file = "av-16.1.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:76f5ed8495cf41e1209a5775d3699dc63fdc1740b94a095e2485f13586593205"},
    {file = "av-16.1.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8d55397190f12a1a3ae7538be58c356cceb2bf50df1b33523817587748ce89e5"},
    {file = "av-16.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:9d51d9037437218261b4bbf9df78a95e216f83d7774fbfe8d289230b5b2e28e2"},
    {file = "av-16.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0ce07a89c15644407f49d942111ca046e323bbab0a9078ff43ee57c9b4a50dad"},
    {file = "av-16.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:cac0c074892ea97113b53556ff41c99562db7b9f09f098adac1f08318c2acad5"},
    {file = "av-16.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7dec3dcbc35a187ce450f65a2e0dda820d5a9e6553eea8344a1459af11c98649"},
    {file = "av-16.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6f90dc082ff2068ddbe77618400b44d698d25d9c4edac57459e250c16b33d700"},
    {file = "av-16.1.0.tar.gz", hash = "sha256:a094b4fd87a3721dacf02794d3d2c82b8d712c85b9534437e82a8a978c175ffd"},
]

[[package]]

@ -1,4 +1,4 @@
{
    "commit_hash": "ff7eb93f310d36f62b79ff5e229935bf50b934e7",
    "timestamp": "2026-01-10T02:39:45Z"
    "commit_hash": "c0e25e54181528c8e0818e2e9bc22fe5a889b8cc",
    "timestamp": "2026-01-12T02:39:25Z"
}

91 security_scanning/poetry.lock generated
@ -5540,53 +5540,58 @@ testing = ["datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff",

[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
    {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"},
    {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"},
    {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"},
    {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"},
    {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"},
    {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"},
    {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"},
    {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"},
    {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"},
    {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"},
    {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"},
    {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"},
    {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"},
    {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"},
    {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"},
    {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"},
    {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"},
    {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"},
    {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"},
    {file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"},
    {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"},
    {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"},
    {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"},
    {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"},
    {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"},
    {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"},
    {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"},
    {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"},
    {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"},
    {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"},
    {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"},
    {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"},
    {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"},
    {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"},
    {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"},
    {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"},
    {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"},
    {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"},
    {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"},
    {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"},
    {file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"},
    {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"},
    {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
    {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"},
    {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"},
    {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"},
    {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"},
    {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"},
    {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"},
    {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"},
    {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"},
    {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"},
    {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"},
    {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"},
    {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"},
    {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"},
    {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"},
    {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"},
    {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"},
    {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"},
    {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"},
    {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"},
    {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"},
    {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"},
    {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"},
    {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"},
    {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"},
    {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"},
    {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"},
    {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"},
    {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"},
    {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"},
    {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"},
    {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"},
    {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"},
    {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"},
    {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"},
    {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"},
    {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"},
    {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"},
    {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"},
    {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"},
    {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"},
    {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"},
    {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"},
    {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"},
    {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"},
    {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"},
    {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"},
]

[[package]]

@ -316,32 +316,50 @@ class ModelConfig(Generic[TConfig]):
        quant_config = QuantConfig()
        layer_quant_config = None

        # Read exclude_modules from HF config if present (HF format module names)
        hf_exclude_modules = hf_quant_config.get('modules_to_not_convert', None)

        # DeepSeek V3 FP8 ckpt
        if hf_quant_config.get("quant_method") == "fp8" and hf_quant_config.get(
                "weight_block_size", []):
            quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
            if moe_backend == 'TRTLLM':
                # TODO: This is a hack. Remove after fp8 bmm is integrated.
                quant_config.exclude_modules = [
                    "*kv_b_proj*", "*k_b_proj*", "*eh_proj"
                ]
            else:
                quant_config.exclude_modules = ["*eh_proj"]

            block_size = hf_quant_config.get("weight_block_size", [])
            assert tuple(block_size) == (
                128, 128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
            quant_config.group_size = block_size[0]

            # Set default exclude_modules for FP8_BLOCK_SCALES
            if moe_backend == 'TRTLLM':
                default_exclude = ["*kv_b_proj*", "*k_b_proj*", "*eh_proj"]
            else:
                default_exclude = ["*eh_proj"]

            # Merge HF config's modules_to_not_convert with default exclude_modules
            if hf_exclude_modules is not None:
                quant_config.exclude_modules = list(
                    set(hf_exclude_modules + default_exclude))
            else:
                quant_config.exclude_modules = default_exclude
        # MXFP4 checkpoints.
        elif hf_quant_config.get("quant_method") == "mxfp4":
            quant_config.quant_algo = ModelConfig.get_mxfp4_quant_algo(
                moe_backend)
            quant_config.group_size = 32
            quant_config.exclude_modules = [

            # Default exclude_modules for MXFP4 (TRTLLM internal format)
            default_exclude = [
                'block.*.attn.out', 'block.*.mlp.gate', 'block.*.attn.qkv',
                'embedding', 'unembedding'
            ]

            # Merge HF config's modules_to_not_convert with default exclude_modules
            if hf_exclude_modules is not None:
                quant_config.exclude_modules = list(
                    set(hf_exclude_modules + default_exclude))
            else:
                quant_config.exclude_modules = default_exclude

        return quant_config, layer_quant_config
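
        # Editorial note: a minimal sketch of the merge semantics above, using
        # hypothetical values (not from any real checkpoint). The set union
        # drops duplicates and does not preserve order:
        #
        #   hf_exclude_modules = ["lm_head"]    # from modules_to_not_convert
        #   default_exclude = ["*eh_proj"]      # backend default
        #   merged = list(set(hf_exclude_modules + default_exclude))
        #   # -> contains "lm_head" and "*eh_proj", in arbitrary order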

    @staticmethod

@ -5,6 +5,7 @@ from .modeling_bert import BertForSequenceClassification
from .modeling_clip import CLIPVisionModel
from .modeling_deepseekv3 import DeepseekV3ForCausalLM
from .modeling_exaone4 import Exaone4ForCausalLM
from .modeling_exaone_moe import ExaoneMoeForCausalLM
from .modeling_gemma3 import Gemma3ForCausalLM
from .modeling_gemma3vl import Gemma3VLM
from .modeling_glm import Glm4MoeForCausalLM
@ -44,6 +45,7 @@ __all__ = [
    "CLIPVisionModel",
    "DeepseekV3ForCausalLM",
    "Exaone4ForCausalLM",
    "ExaoneMoeForCausalLM",
    "Gemma3ForCausalLM",
    "Gemma3VLM",
    "HCXVisionForCausalLM",

581 tensorrt_llm/_torch/models/modeling_exaone_moe.py Normal file
@ -0,0 +1,581 @@
import math
import os
import re
from typing import Dict, List, Optional, Tuple

import torch
from torch import nn

from tensorrt_llm._ipc_utils import can_access_peer
from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

from ...logger import logger
from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (
    PositionalEmbeddingParams,
    PredefinedAttentionMask,
    RopeParams,
)
from ..distributed import (
    AllReduce,
    AllReduceFusionOp,
    AllReduceParams,
    MoEAllReduce,
    MoEAllReduceParams,
)
from ..model_config import ModelConfig
from ..models.modeling_deepseekv3 import Deepseekv3MoE
from ..modules.decoder_layer import DecoderLayer
from ..modules.embedding import Embedding
from ..modules.gated_mlp import GatedMLP
from ..modules.linear import TensorParallelMode
from ..modules.rms_norm import RMSNorm
from ..utils import AuxStreamType, Fp4QuantizedTensor
from .modeling_utils import (
    DecoderModel,
    DecoderModelForCausalLM,
    EagerFusionConfig,
    register_auto_model,
)

# fmt: off
# TODO: Remove this once we have a proper transformers package
from transformers import AutoConfig, PretrainedConfig  # isort: skip

class ExaoneMoEConfig(PretrainedConfig):
    model_type = "exaone_moe"

logger.warning_once(
    "transformers does not support 'ExaoneMoEConfig'. "
    "Registering ExaoneMoEConfig to mimic the ExaoneMoE model.",
    key="EXAONE_MOE_REGISTER_WARNING"
)
AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig)
# End of the config registration.
# fmt: on
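
# Editorial note: a minimal sketch of what the registration above enables,
# using a hypothetical checkpoint path. Once "exaone_moe" is registered,
# AutoConfig can resolve a config.json whose model_type is "exaone_moe":
#
#   from transformers import AutoConfig
#   cfg = AutoConfig.from_pretrained("/path/to/exaone-moe-ckpt")  # hypothetical path
#   assert cfg.model_type == "exaone_moe"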


def check_is_moe(config: ExaoneMoEConfig, layer_idx: int) -> bool:
    """
    Check if the current layer is a MoE layer.
    """
    return hasattr(config, "is_moe_layer") and config.is_moe_layer[layer_idx]


def enable_attn_allreduce(mapping: Mapping):
    return not mapping.enable_attention_dp or mapping.has_tp()


class ExaoneMoeAttention(QKNormRoPEAttention):
    def __init__(
        self,
        model_config: ModelConfig[ExaoneMoEConfig],
        layer_idx: Optional[int] = None,
        fuse_qk_norm_rope: bool = False,
        disable_deep_gemm: bool = False,
    ):
        config = model_config.pretrained_config

        self.attention_window_size = None
        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"

        # NOTE: In ExaoneMoe, only sliding layers apply rope.
        pos_embd_params = None
        if self.is_sliding:
            self.attention_window_size = config.sliding_window
            pos_embd_params = PositionalEmbeddingParams(
                type=PositionEmbeddingType.rope_gpt_neox,
                rope=RopeParams.from_config(config),
            )

        fuse_qk_norm_rope = self.is_sliding and fuse_qk_norm_rope

        # NOTE: Fusing qk norm with rope has an issue that slightly hurts accuracy.
        assert not fuse_qk_norm_rope, "Fusing qk norm and rope currently has an accuracy issue"

        super().__init__(
            hidden_size=config.hidden_size,
            num_attention_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
            max_position_embeddings=config.max_position_embeddings,
            bias=False,
            pos_embd_params=pos_embd_params,
            fuse_qk_norm_rope=fuse_qk_norm_rope,
            skip_rope=not self.is_sliding,
            layer_idx=layer_idx,
            dtype=config.torch_dtype,
            config=model_config,
            disable_deep_gemm=disable_deep_gemm,
            reduce_output=enable_attn_allreduce(model_config.mapping),
        )

    def forward(
        self,
        position_ids: Optional[torch.LongTensor],
        hidden_states: torch.Tensor,
        attn_metadata: AttentionMetadata,
        attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.CAUSAL,
        lora_params: Optional[dict] = None,
        **kwargs,
    ) -> torch.Tensor:
        return super().forward(
            position_ids=position_ids,
            hidden_states=hidden_states,
            attn_metadata=attn_metadata,
            attention_mask=attention_mask,
            lora_params=lora_params,
            attention_window_size=self.attention_window_size,
            **kwargs,
        )


class ExaoneMoeSparseMoEBlock(Deepseekv3MoE):
    """
    ExaoneMoe Sparse MoE Block Layer.

    It follows the DeepSeek-V3 implementation.
    """


class ExaoneMoeDecoderLayer(DecoderLayer):
    def __init__(
        self,
        model_config: ModelConfig[ExaoneMoEConfig],
        aux_stream_dict: Dict[AuxStreamType, torch.cuda.Stream],
        layer_idx: int,
    ):
        super().__init__()
        self.model_config = model_config
        config = model_config.pretrained_config
        self.layer_idx = layer_idx

        self.mapping = model_config.mapping
        mapping = self.mapping
        self.enable_attention_dp = mapping.enable_attention_dp
        self.mlp_tp_size = mapping.tp_size
        self.is_p2p_supported = can_access_peer(mapping)

        self.fusion_config = EagerFusionConfig()
        # MoE fusions are disabled by default in K-EXAONE since
        # they may cause a slight accuracy drop due to the numerical gap.
        self.enable_fusion = os.environ.get("TRTLLM_EXAONE_EAGER_FUSION_ENABLED", "0") == "1"
        self.enable_fusion &= not self.enable_attention_dp

        # FIXME: incompatible with mixed quantization mode
        quant_config = self._get_decoder_layer_quant_config(model_config, layer_idx)
        self.is_nvfp4 = quant_config.layer_quant_mode.has_nvfp4()
        assert quant_config.quant_algo is not QuantAlgo.MIXED_PRECISION, (
            "MIXED_PRECISION is ambiguous"
        )

        self.allreduce = None
        self.moe_allreduce = None
        if not self.enable_attention_dp and self.mapping.tp_size > 1:
            self.allreduce = AllReduce(
                mapping=model_config.mapping,
                strategy=model_config.allreduce_strategy,
                dtype=config.torch_dtype,
            )
            self.moe_allreduce = MoEAllReduce(self.mapping)

        has_tp = mapping.has_tp()
        has_pp = mapping.has_pp()

        # Submodule definitions
        self.input_layernorm = RMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
        )

        self.self_attn = ExaoneMoeAttention(model_config, layer_idx=layer_idx)

        # MoE or Dense layer
        self.is_moe_layer = check_is_moe(config, layer_idx)
        if self.is_moe_layer:
            self.fusion_config.PRE_MOE_FUSION = self.enable_fusion and has_tp
            self.fusion_config.POST_MOE_FUSION = self.fusion_config.PRE_MOE_FUSION and not has_pp
            self.mlp = ExaoneMoeSparseMoEBlock(
                num_experts=config.num_experts,
                top_k=config.num_experts_per_tok,
                hidden_size=config.hidden_size,
                intermediate_size=config.moe_intermediate_size,
                shared_expert_intermediate_size=config.moe_intermediate_size
                * config.num_shared_experts,
                dtype=config.torch_dtype,
                model_config=model_config,
                override_quant_config=quant_config,
                aux_stream_dict=aux_stream_dict,
                layer_idx=layer_idx,
            )
        else:
            block_size = 1
            if quant_config.quant_algo is None and quant_config.group_size is not None:
                block_size = quant_config.group_size
            self.mlp_tp_size = self._compute_mlp_tp_size(config.intermediate_size, block_size)
            has_mlp_tp = self.mlp_tp_size > 1

            self.fusion_config.PRE_MLP_FUSION = self.enable_fusion and has_mlp_tp and self.is_nvfp4
            self.fusion_config.POST_MLP_FUSION = self.enable_fusion and has_mlp_tp

            self.mlp = GatedMLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                bias=False,
                dtype=config.torch_dtype,
                config=model_config,
                # Keep sharding consistent with computed mlp_tp_size.
                # In attention-DP, mlp_tp_size==1 -> disable TP sharding here.
                overridden_tp_size=self.mlp_tp_size,
                layer_idx=layer_idx,
                reduce_output=has_mlp_tp,
            )

        self.disable_attn_allreduce = (
            self.fusion_config.PRE_MOE_FUSION
            or self.fusion_config.PRE_MLP_FUSION
            or self.mapping.tp_size == 1
            or self.enable_attention_dp
        )

        self.post_attention_layernorm = RMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
        )
        self.next_layer_layernorm: RMSNorm = None

    def _get_decoder_layer_quant_config(
        self, model_config: ModelConfig[ExaoneMoEConfig], layer_idx: int
    ):
        """
        The MTP layer in the nvfp4 checkpoint is unquantized. Because the TRTLLM
        moe_backend only supports fp8/fp4 quantization, we need to override
        the quant_config for the MTP layer.
        """
        quant_config = model_config.quant_config

        layer_name = f"model.layers.{layer_idx}"
        if quant_config.is_module_excluded_from_quantization(layer_name):
            return QuantConfig(
                quant_algo=None,
                kv_cache_quant_algo=quant_config.kv_cache_quant_algo,
            )
        else:
            return model_config.quant_config

    def _compute_mlp_tp_size(self, intermediate_size: int, block_size: int) -> int:
        """Adapted from DeepseekV3DecoderLayer._compute_mlp_tp_size."""
        assert intermediate_size % block_size == 0, (
            f"intermediate_size {intermediate_size} must be divisible by block_size {block_size}."
        )
        if self.enable_attention_dp:
            # If using attention DP, the MLP also uses DP instead of TP.
            mlp_tp_size = 1
        else:
            # The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
            tp = math.gcd(
                intermediate_size // block_size,
                self.mapping.tp_size,
            )

            if tp > self.mapping.gpus_per_node:
                mlp_tp_size = math.gcd(
                    tp,
                    self.mapping.gpus_per_node,
                )  # Avoid costly inter-node TP
            else:
                mlp_tp_size = tp
        return mlp_tp_size
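
        # Editorial note: a worked example of the gcd logic above, with
        # assumed, illustrative values. For intermediate_size=18432,
        # block_size=128, tp_size=12, gpus_per_node=8:
        #   gcd(18432 // 128, 12) = gcd(144, 12) = 12  # candidate TP size
        #   12 > 8, so mlp_tp_size = gcd(12, 8) = 4    # avoid inter-node TP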

    def forward(
        self,
        position_ids: torch.LongTensor,
        hidden_states: torch.Tensor,
        attn_metadata: AttentionMetadata,
        residual: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # LN has already been applied at the previous layer, except for the first layer.
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)

        hidden_states = self.self_attn(
            position_ids=position_ids,
            hidden_states=hidden_states,
            attn_metadata=attn_metadata,
            all_reduce_params=AllReduceParams(enable_allreduce=not (self.disable_attn_allreduce)),
            **kwargs,
        )

        if self.is_moe_layer:
            hidden_states, residual = self.forward_moe(
                hidden_states=hidden_states,
                attn_metadata=attn_metadata,
                residual=residual,
            )
        else:
            hidden_states, residual = self.forward_mlp(
                hidden_states=hidden_states,
                residual=residual,
            )

        return hidden_states, residual

    def forward_moe(
        self,
        hidden_states: torch.Tensor,
        attn_metadata: AttentionMetadata,
        residual: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        def _run_moe(hidden_states, hidden_states_fp4, do_finalize):
            return self.mlp(
                hidden_states,
                hidden_states_fp4,
                all_rank_num_tokens=attn_metadata.all_rank_num_tokens,
                final_all_reduce_params=AllReduceParams(
                    enable_allreduce=not (
                        self.fusion_config.POST_MOE_FUSION or self.mapping.tp_size == 1
                    )
                ),
                do_finalize=do_finalize,
            )

        if self.fusion_config.PRE_MOE_FUSION:
            # moe_backend can be either CUTLASS or TRTLLM here
            hidden_states, residual = self.allreduce(
                hidden_states,
                all_reduce_params=AllReduceParams(
                    fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
                    residual=residual,
                    norm_weight=self.post_attention_layernorm.weight,
                    eps=self.post_attention_layernorm.variance_epsilon,
                    trigger_completion_at_end=False,
                ),
            )
        else:
            # No fusion
            hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)

        # Note: this fusion pattern is only supported for the single-node TRTLLM-nvfp4 backend now
        do_finalize = self.mapping.is_multi_node() or (
            not (
                self.fusion_config.POST_MOE_FUSION
                and hidden_states.shape[0] <= self.moe_allreduce.max_token
                and self.model_config.moe_backend == "TRTLLM"
                and self.mlp.experts.has_nvfp4
                and self.is_p2p_supported
            )
        )

        hidden_states = _run_moe(hidden_states, hidden_states_fp4=None, do_finalize=do_finalize)

        if self.fusion_config.POST_MOE_FUSION:
            if do_finalize:
                hidden_states, residual = self.allreduce(
                    hidden_states,
                    all_reduce_params=AllReduceParams(
                        fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
                        residual=residual,
                        norm_weight=self.next_layer_layernorm.weight,
                        eps=self.next_layer_layernorm.variance_epsilon,
                        trigger_completion_at_end=False,
                    ),
                )
            else:
                assert len(hidden_states) == 4, "hidden_states must have 4 elements"

                shared_output = hidden_states[0]
                fc2_output = hidden_states[1]
                expert_scale_factor = hidden_states[2]
                expanded_idx_to_permuted_idx = hidden_states[3]

                moe_all_reduce_params = MoEAllReduceParams(
                    expanded_idx_to_permuted_idx=expanded_idx_to_permuted_idx,
                    expert_scale_factor=expert_scale_factor,
                    shared_expert_output=shared_output,
                    residual=residual,
                    norm_weight=self.next_layer_layernorm.weight,
                    eps=self.next_layer_layernorm.variance_epsilon,
                    is_cutlass_min_latency=False,
                )
                hidden_states, residual = self.moe_allreduce(
                    fc2_output, all_reduce_params=moe_all_reduce_params
                )
        elif self.next_layer_layernorm is not None:
            hidden_states, residual = self.next_layer_layernorm(hidden_states, residual)

        return hidden_states, residual

    def forward_mlp(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.fusion_config.PRE_MLP_FUSION:
            act_fp4, act_sf, residual = self.allreduce(
                hidden_states,
                all_reduce_params=AllReduceParams(
                    fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4,
                    residual=residual,
                    norm_weight=self.post_attention_layernorm.weight,
                    scale=self.mlp.gate_up_proj.input_scale,
                    eps=self.post_attention_layernorm.variance_epsilon,
                ),
            )
            hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
        else:
            hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)

        hidden_states = self.mlp(
            hidden_states,
            final_all_reduce_params=AllReduceParams(
                enable_allreduce=not (self.fusion_config.POST_MLP_FUSION or self.mlp_tp_size == 1)
            ),
        )

        if self.fusion_config.POST_MLP_FUSION:
            hidden_states, residual = self.allreduce(
                hidden_states,
                all_reduce_params=AllReduceParams(
                    fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
                    residual=residual,
                    norm_weight=self.next_layer_layernorm.weight,
                    eps=self.next_layer_layernorm.variance_epsilon,
                ),
            )
        elif self.next_layer_layernorm is not None:
            hidden_states, residual = self.next_layer_layernorm(hidden_states, residual)

        return hidden_states, residual


class ExaoneMoeModel(DecoderModel):
    def __init__(self, model_config: ModelConfig[ExaoneMoEConfig]):
        super().__init__(model_config)
        config = self.model_config.pretrained_config
        self.num_hidden_layers = config.num_hidden_layers
        self.embed_tokens = Embedding(
            config.vocab_size,
            config.hidden_size,
            dtype=config.torch_dtype,
            mapping=model_config.mapping,
            tensor_parallel_mode=TensorParallelMode.COLUMN,
            gather_output=True,
        )

        aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
        self.aux_stream_dict = {
            AuxStreamType.Attention: aux_stream_list[0],
            AuxStreamType.MoeShared: aux_stream_list[0],
            AuxStreamType.MoeChunkingOverlap: aux_stream_list[1],
            AuxStreamType.MoeBalancer: aux_stream_list[2],
        }

        self.layers = nn.ModuleList(
            [
                ExaoneMoeDecoderLayer(
                    model_config=model_config,
                    aux_stream_dict=self.aux_stream_dict,
                    layer_idx=layer_idx,
                )
                for layer_idx in range(self.num_hidden_layers)
            ]
        )

        self.norm = RMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
        )

    def forward(
        self,
        attn_metadata: AttentionMetadata,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        lora_params=None,
        **kwargs,
    ) -> torch.Tensor | Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at "
                "the same time, and must specify either one."
            )
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = inputs_embeds.to(self.dtype)
        residual = None

        for decoder_layer in self.layers[: self.num_hidden_layers]:
            hidden_states, residual = decoder_layer(
                position_ids=position_ids,
                hidden_states=hidden_states,
                attn_metadata=attn_metadata,
                residual=residual,
                lora_params=lora_params,
            )
        # The last LN has already been applied as part of the fusion.
        return hidden_states


@register_auto_model("ExaoneMoEForCausalLM")
class ExaoneMoeForCausalLM(DecoderModelForCausalLM[ExaoneMoeModel, ExaoneMoEConfig]):
    def __init__(
        self,
        model_config: ModelConfig[ExaoneMoEConfig],
    ):
        super().__init__(
            ExaoneMoeModel(model_config),
            config=model_config,
            hidden_size=model_config.pretrained_config.hidden_size,
            vocab_size=model_config.pretrained_config.vocab_size,
        )

    def load_weights(
        self,
        weights: Dict,
        weight_mapper: Optional["BaseWeightMapper"] = None,  # noqa: F821
        skip_modules: Optional[List[str]] = None,
        allow_partial_loading: bool = False,
    ):
        # MoE naming pattern.
        moe_weight_patterns = {
            "gate_proj": "w1",
            "up_proj": "w3",
            "down_proj": "w2",
        }

        module_names = list(weights)
        for name in module_names:
            if "mlp.e_score_correction_bias" in name:
                # Move the bias into the gate module.
                new_name = name.replace(
                    "mlp.e_score_correction_bias", "mlp.gate.e_score_correction_bias"
                )
            else:
                # MoE weight remapping.
                new_name = name
                for k, v in moe_weight_patterns.items():
                    pattern = rf"(experts\.\d+\.){k}\b"
                    new_name = re.sub(pattern, rf"\1{v}", new_name)

            # Remap the name-parameter pair if needed.
            if new_name != name:
                weights[new_name] = weights.pop(name)

        super().load_weights(
            weights=weights,
            weight_mapper=weight_mapper,
            skip_modules=skip_modules or [],
            allow_partial_loading=allow_partial_loading,
        )
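
        # Editorial note: an illustrative trace of the remapping above, using
        # hypothetical weight names (not from a real checkpoint):
        #   "model.layers.3.mlp.experts.7.gate_proj.weight"
        #       -> "model.layers.3.mlp.experts.7.w1.weight"
        #   "model.layers.3.mlp.e_score_correction_bias"
        #       -> "model.layers.3.mlp.gate.e_score_correction_bias"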

    def post_load_weights(self):
        # For the cross-layer residual+LN fusion.
        for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]):
            if idx == self.config.num_hidden_layers - 1:
                layer.next_layer_layernorm = self.model.norm
            else:
                layer.next_layer_layernorm = self.model.layers[idx + 1].input_layernorm
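
        # Editorial note: with the wiring above, each layer applies the *next*
        # layer's input layernorm at the end of its own forward (fused with the
        # residual add), and the last layer applies the final model norm. This
        # is why ExaoneMoeModel.forward returns hidden_states without applying
        # self.norm again.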
|
||||
@ -32,6 +32,7 @@ from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from tensorrt_llm._torch.expert_statistic import ExpertStatistic
|
||||
from tensorrt_llm._torch.model_config import ModelConfig
|
||||
from tensorrt_llm._torch.modules.fused_moe.interface import MoE
|
||||
from tensorrt_llm._torch.modules.fused_moe.routing import BaseMoeRoutingMethod
|
||||
@ -619,6 +620,10 @@ class ConfigurableMoE(MoE):
|
||||
else:
|
||||
token_selected_slots = token_selected_experts
|
||||
|
||||
if token_selected_slots is not None:
|
||||
ExpertStatistic.set_layer(self.layer_idx)
|
||||
ExpertStatistic.maybe_add_info(self.num_slots, token_selected_slots)
|
||||
|
||||
# ========== Step 3.5: Communication Prepare Phase (BEFORE quantization) ==========
|
||||
# NVLINK two-sided has a prepare phase to gather EPLB statistics
|
||||
|
||||
@ -647,6 +652,10 @@ class ConfigurableMoE(MoE):
|
||||
# supports_post_quant_dispatch checks strategy capability for the current quant mode
|
||||
supports_post_quant = self.comm.supports_post_quant_dispatch()
|
||||
|
||||
# Call dummy_allreduce before allgather for load balancing debug
|
||||
if self.enable_dummy_allreduce:
|
||||
self.dummy_allreduce()
|
||||
|
||||
if supports_post_quant:
|
||||
# ===== Post-quant flow: Quantize → Dispatch =====
|
||||
|
||||
@ -710,6 +719,8 @@ class ConfigurableMoE(MoE):
|
||||
|
||||
# ========== Step 9: Communication - Combine ==========
|
||||
if self.comm is not None:
|
||||
if self.enable_dummy_allreduce:
|
||||
self.dummy_allreduce()
|
||||
# Use unified combine interface (reads dispatch state from strategy)
|
||||
final_hidden_states = self.comm.combine(final_hidden_states)
|
||||
else:
|
||||
|
||||
@ -159,10 +159,6 @@ class WideEPMoE(MoE):
|
||||
if not model_config.skip_create_weights_in_init:
|
||||
self.create_weights()
|
||||
|
||||
# Debug function for eliminating imbalance during performance analysis.
|
||||
self.enable_dummy_allreduce = os.environ.get(
|
||||
"TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") == "1"
|
||||
|
||||
# MoE op will be lazily initialized when first accessed (see moe_op_impl property)
|
||||
self._moe_op_impl = None
|
||||
|
||||
@ -342,16 +338,6 @@ class WideEPMoE(MoE):
|
||||
self._moe_op_impl = MoEOpSelector.select_op(self)
|
||||
return self._moe_op_impl
|
||||
|
||||
def dummy_allreduce(self):
|
||||
"""
|
||||
Debug function for eliminating imbalance during performance analysis.
|
||||
Creates a small dummy tensor and performs allreduce to synchronize processes
|
||||
and eliminate timing imbalances for more accurate profiling measurements.
|
||||
"""
|
||||
dummy_tensor = torch.zeros(4, dtype=torch.float32, device='cuda')
|
||||
dummy_tensor = self.all_reduce(dummy_tensor)
|
||||
return dummy_tensor
|
||||
|
||||
def reducescatter_or_allreduce(
|
||||
self,
|
||||
inputs,
|
||||
|
||||
@ -1,3 +1,4 @@
import os
import weakref
from abc import abstractmethod
from enum import Enum, IntEnum
@ -200,11 +201,19 @@ class MoE(nn.Module):
        self.intermediate_size_per_partition = intermediate_size // self.tp_size

        self.all_reduce = None
        # Debug function for eliminating imbalance during performance analysis.
        self.enable_dummy_allreduce = os.environ.get(
            "TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") == "1"
        if not self.use_dp and self.mapping.tp_size > 1:
            self.all_reduce = AllReduce(
                mapping=self.mapping,
                strategy=model_config.allreduce_strategy,
                dtype=self.dtype)
        elif self.enable_dummy_allreduce:
            from tensorrt_llm.functional import AllReduceStrategy
            self.all_reduce = AllReduce(mapping=self.mapping,
                                        strategy=AllReduceStrategy.NCCL,
                                        dtype=self.dtype)

        # Initialize load balancer related attributes
        if init_load_balancer:
@ -748,3 +757,14 @@ class MoE(nn.Module):
        elif self.reduce_results:
            outputs = self.all_reduce(inputs)
        return outputs

    def dummy_allreduce(self):
        """
        Debug function for eliminating imbalance during performance analysis.
        Creates a small dummy tensor and performs allreduce to synchronize processes
        and eliminate timing imbalances for more accurate profiling measurements.
        """
        assert self.enable_dummy_allreduce and self.all_reduce is not None, "Dummy allreduce is not enabled"
        dummy_tensor = torch.zeros(4, dtype=torch.float32, device="cuda")
        dummy_tensor = self.all_reduce(dummy_tensor)
        return dummy_tensor
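A hedged sketch of how this env-gated debug collective is meant to be used during profiling, with plain torch.distributed standing in for the repo's AllReduce module:

import os
import torch
import torch.distributed as dist

def maybe_dummy_allreduce():
    # Mirrors TRTLLM_ENABLE_DUMMY_ALLREDUCE: a tiny all-reduce that acts
    # as a near-free barrier, so per-rank load imbalance does not smear
    # timings in profiler traces.
    if os.environ.get("TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") != "1":
        return
    t = torch.zeros(4, dtype=torch.float32, device="cuda")
    dist.all_reduce(t)  # negligible payload, synchronizes all ranks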
@ -257,21 +257,33 @@ class Deepseekv3RoutingImpl:
        if self.n_group > 1:
            if self.top_k > 8 or (num_experts / n_group) > 32 or (
                    num_experts / n_group) * self.topk_group > 128:
                if (self.is_fused):
                if self.is_fused:
                    warnings.warn(
                        "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
                    )
                    self.is_fused = False
        else:
        elif (num_experts > 512 or (self.top_k > 8 and self.top_k != 22)
              or self.topk_group == 1):
            # We have a special implementation for n_group == 1, top_k == 22 and num_experts == 512 for Nemotron Super v3.
            if num_experts > 512 or (self.top_k > 8 and self.top_k != 22):
                if (self.is_fused):
                    warnings.warn(
                        "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
                    )
                    self.is_fused = False
            if self.is_fused:
                warnings.warn(
                    "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
                )
                self.is_fused = False

        if not self.is_fused:
            if self.n_group == 1 and self.topk_group == 1:
                scores, scores_with_bias = self.get_scores(logits,
                                                           e_score_correction_bias)
                _, topk_indices = torch.topk(scores_with_bias, k=self.top_k, dim=1)
                topk_values = torch.gather(scores, dim=1,
                                           index=topk_indices).type_as(scores)

                # Normalize and scale.
                topk_values_sum = torch.sum(topk_values, dim=-1,
                                            keepdim=True) + 1e-20
                topk_values = topk_values / topk_values_sum * self.routed_scaling_factor
                return topk_values, topk_indices
        elif not self.is_fused:
            scores, scores_with_bias = self.get_scores(logits,
                                                       e_score_correction_bias)
            scores_shape = list(scores_with_bias.shape)
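For reference, a self-contained sketch of the unfused top-k routing path shown above (bias-corrected selection, gather of unbiased scores, normalize, scale); shapes and the constant bias are illustrative:

import torch

def route_topk(scores, scores_with_bias, top_k, routed_scaling_factor):
    # Select experts using the bias-corrected scores, but return the
    # unbiased scores as routing weights.
    _, topk_indices = torch.topk(scores_with_bias, k=top_k, dim=1)
    topk_values = torch.gather(scores, dim=1, index=topk_indices)
    # Normalize so the chosen experts' weights sum to 1, then scale.
    topk_values = topk_values / (topk_values.sum(dim=-1, keepdim=True) + 1e-20)
    return topk_values * routed_scaling_factor, topk_indices

scores = torch.rand(2, 16)
weights, idx = route_topk(scores, scores + 0.01, top_k=4, routed_scaling_factor=2.5)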
@ -1167,7 +1167,8 @@ class PyExecutor:
            for req in previous_batch.scheduled_ctx_reqs:
                if req.is_context_only_request and (
                        req.is_context_finished
                        or req.is_finished_due_to_length):
                        or req.is_finished_due_to_length
                ) and not req.is_finished_due_to_cancellation:
                    block_id = self.kv_cache_manager.store_blocks_for_reuse(
                        req, True)
                    self.ctx_in_transmission_requests[
@ -1436,7 +1437,8 @@ class PyExecutor:
            for req in scheduled_batch.context_requests:
                if req.is_context_only_request and (
                        req.is_context_finished
                        or req.is_finished_due_to_length):
                        or req.is_finished_due_to_length
                ) and not req.is_finished_due_to_cancellation:
                    block_id = self.kv_cache_manager.store_blocks_for_reuse(
                        req, True)
                    self.ctx_in_transmission_requests[
@ -1686,7 +1688,8 @@ class PyExecutor:
            for req in self.previous_batch.sample_state.scheduled_requests.context_requests:
                if req.is_context_only_request and (
                        req.is_context_finished
                        or req.is_finished_due_to_length):
                        or req.is_finished_due_to_length
                ) and not req.is_finished_due_to_cancellation:
                    block_id = self.kv_cache_manager.store_blocks_for_reuse(
                        req, True)
                    self.ctx_in_transmission_requests[
@ -2196,8 +2199,9 @@ class PyExecutor:
        if (scheduled_ctx_requests is None or len(scheduled_ctx_requests) == 0):
            return []
        for req in scheduled_ctx_requests:
            if req.is_context_only_request and (req.is_context_finished or
                                                req.is_finished_due_to_length):
            if req.is_context_only_request and (
                    req.is_context_finished or req.is_finished_due_to_length
            ) and not req.is_finished_due_to_cancellation:
                self.kv_cache_transceiver.respond_and_send_async(req)
                for resource_mgr_type in (
                        ResourceManagerType.SEQ_SLOT_MANAGER,
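The recurring predicate in these hunks, as a standalone sketch (attribute names follow the diff; Req is a stand-in dataclass, not the real LlmRequest):

from dataclasses import dataclass

@dataclass
class Req:  # stand-in for LlmRequest
    is_context_only_request: bool
    is_context_finished: bool
    is_finished_due_to_length: bool
    is_finished_due_to_cancellation: bool

def should_store_for_reuse(req: Req) -> bool:
    # Context-only requests that finished normally (or hit the length
    # limit) get their KV blocks stored for reuse; cancelled requests
    # are skipped so their blocks are freed instead of pinned.
    return (req.is_context_only_request
            and (req.is_context_finished or req.is_finished_due_to_length)
            and not req.is_finished_due_to_cancellation)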
@ -1431,7 +1431,8 @@ class ResourceManager:
            resource_manager.update_resources(scheduled_batch)

    def free_resources(self, request: LlmRequest):
        for _, resource_manager in reversed(self.resource_managers.items()):
        for resource_type, resource_manager in reversed(
                self.resource_managers.items()):
            if hasattr(resource_manager, "free_resources"):
                resource_manager.free_resources(request)
@ -560,7 +560,7 @@ class ReportUtility:
        else:
            backend_info = (
                "\n\n===========================================================\n"
                "= PYTORCH BACKEND\n"
                f"= {self.rt_cfg.backend.upper()} BACKEND\n"
                "===========================================================\n"
                f"Model:\t\t\t{engine['model']}\n"
                f"Model Path:\t\t{engine['model_path']}\n"
@ -207,7 +207,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
        kwargs = self.get_default_kwargs()
        # TODO: multi-stream MOE seems to increase the memory usage
        kwargs["max_batch_size"] = 32
        kwargs["free_mem_ratio"] = 0.5
        kwargs["free_mem_ratio"] = 0.4
        sampling_params = self.get_default_sampling_params()
        with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                           tokenizer=self.MODEL_PATH_BF16,
@ -226,9 +226,9 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
            # Manually set quant_config for FP8 model to get the accuracy threshold
            llm.args.quant_config.quant_algo = QuantAlgo.FP8
            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8

            # task = MMLU(self.MODEL_NAME)
            # task.evaluate(llm, sampling_params=sampling_params)
            sampling_params = self.get_default_sampling_params()
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm, sampling_params=sampling_params)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
@ -260,6 +260,7 @@ class TestQwen3VL_MOE(LlmapiAccuracyTestHarness):
        max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<|endoftext|>"
    )

    @pytest.mark.skip_less_device_memory(140000)
    def test_auto_dtype(self):
        with LLM(
            self.MODEL_PATH,
@ -0,0 +1,44 @@
hostname: localhost
port: 8000
model: DeepSeek-V3-Lite/bf16
backend: "pytorch"
enable_autotuner: False
context_servers:
  disable_overlap_scheduler: True
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  max_num_tokens: 16384
  max_seq_len: 32768
  enable_chunked_prefill: True
  kv_cache_config:
    enable_block_reuse: True
    enable_partial_reuse: True
    free_gpu_memory_fraction: 0.3
  cache_transceiver_config:
    backend: "DEFAULT"
    max_tokens_in_buffer: 32768
  cuda_graph_config:
    enable_padding: True
    max_batch_size: 1
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  max_num_tokens: 2048
  max_seq_len: 32768
  enable_chunked_prefill: True
  kv_cache_config:
    enable_block_reuse: True
    enable_partial_reuse: True
    free_gpu_memory_fraction: 0.85
  cache_transceiver_config:
    backend: "DEFAULT"
    max_tokens_in_buffer: 32768
  cuda_graph_config:
    enable_padding: True
    max_batch_size: 64
  urls:
    - "localhost:8002"
@ -0,0 +1,44 @@
hostname: localhost
port: 8000
model: DeepSeek-V3-0324-FP4
backend: "pytorch"
enable_autotuner: False
context_servers:
  disable_overlap_scheduler: True
  num_instances: 1
  tensor_parallel_size: 4
  pipeline_parallel_size: 1
  max_num_tokens: 12000
  max_seq_len: 262144
  enable_chunked_prefill: True
  kv_cache_config:
    enable_block_reuse: True
    enable_partial_reuse: True
    free_gpu_memory_fraction: 0.2
  cache_transceiver_config:
    backend: "DEFAULT"
    max_tokens_in_buffer: 262144
  cuda_graph_config:
    enable_padding: True
    max_batch_size: 1
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 4
  pipeline_parallel_size: 1
  max_num_tokens: 2048
  max_seq_len: 262144
  enable_chunked_prefill: True
  kv_cache_config:
    enable_block_reuse: True
    enable_partial_reuse: True
    free_gpu_memory_fraction: 0.3
  cache_transceiver_config:
    backend: "DEFAULT"
    max_tokens_in_buffer: 262144
  cuda_graph_config:
    enable_padding: True
    max_batch_size: 11
  urls:
    - "localhost:8002"
@ -200,6 +200,10 @@ def get_test_config(test_desc, example_dir, test_root):
        "gpt_oss_120b_stress":
        (4,
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml"),
        "cancel_stress_test":
        (2, f"{test_configs_root}/disagg_config_cancel_stress_test.yaml"),
        "cancel_stress_test_large":
        (8, f"{test_configs_root}/disagg_config_cancel_stress_test_large.yaml"),
    }

    if test_desc not in config_map:
@ -2098,3 +2102,211 @@ def test_disaggregated_stress_test(disaggregated_test_root,
        threshold=test_config.accuracy_threshold,
        env=llm_venv._new_env,
        cwd=llm_venv.get_working_directory())


def run_cancel_stress_test(server_url: str,
                           num_bursts: int = 5,
                           requests_per_burst: int = 32,
                           prompt_len_range: tuple = (2000, 8000),
                           cancel_after_range: tuple = (0.01, 0.1)):
    """
    Stress test that sends requests with large contexts and cancels them
    during prefill to test resource cleanup under cancellation.

    Args:
        server_url: The server URL (e.g., "http://localhost:8000")
        num_bursts: Number of request bursts to send
        requests_per_burst: Number of concurrent requests per burst
        prompt_len_range: (min, max) prompt length in tokens
        cancel_after_range: (min, max) seconds to wait before cancelling
    """
    import asyncio
    import random
    import time

    import aiohttp

    async def spam_and_cancel(session, req_id, url, prompt_len_range,
                              cancel_after_range):
        """Send a request and cancel it during prefill."""
        prompt_len = random.randint(prompt_len_range[0], prompt_len_range[1])
        prompt = "test " * (prompt_len // 5)

        payload = {
            "model": "test-model",
            "prompt": prompt,
            "max_tokens": 10,
            "stream": True
        }

        try:
            cancel_after = random.uniform(cancel_after_range[0],
                                          cancel_after_range[1])
            start = time.time()
            async with session.post(
                    f"{url}/v1/completions",
                    json=payload,
                    timeout=aiohttp.ClientTimeout(total=60)) as resp:
                async for line in resp.content:
                    if time.time() - start > cancel_after:
                        # Force disconnect during prefill
                        break
        except Exception:
            pass  # Connection abort is expected

    async def run_bursts():
        async with aiohttp.ClientSession() as session:
            for burst_idx in range(num_bursts):
                tasks = [
                    spam_and_cancel(session, i, server_url, prompt_len_range,
                                    cancel_after_range)
                    for i in range(requests_per_burst)
                ]
                await asyncio.gather(*tasks)
                logger.info(
                    f"Completed burst {burst_idx + 1}/{num_bursts} ({requests_per_burst} requests)"
                )
                await asyncio.sleep(0.05)

    asyncio.run(run_bursts())


def run_disaggregated_cancel_test(example_dir,
                                  test_desc,
                                  env=None,
                                  cwd=None,
                                  num_bursts=64,
                                  requests_per_burst=64):
    """Run disaggregated test with request cancellation stress test."""
    cleanup_output_files()
    run_env = env.copy()
    run_env["UCX_TLS"] = "^ib"

    num_ranks, config_file = get_test_config(test_desc, example_dir,
                                             os.path.dirname(__file__))

    workers_cmd = [
        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
        str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
        config_file
    ]

    server_start_timeout = 1200
    server_cmd = [
        'trtllm-serve', 'disaggregated', '--server_start_timeout',
        str(server_start_timeout), '-c', config_file
    ]
    server_host, server_port = get_disagg_server_url_from_cfg(config_file)
    server_url = f"http://{server_host}:{server_port}"

    try:
        with (open('output_workers.log', 'w') as output_workers,
              popen(workers_cmd,
                    stdout=output_workers,
                    stderr=subprocess.STDOUT,
                    env=run_env,
                    cwd=cwd) as workers_proc,
              open('output_disagg.log', 'w') as output_disagg,
              popen(server_cmd,
                    stdout=output_disagg,
                    stderr=subprocess.STDOUT,
                    env=run_env,
                    cwd=cwd) as server_proc):

            # Wait for server to be ready
            if not wait_for_server(server_host,
                                   server_port,
                                   timeout_seconds=server_start_timeout):
                raise RuntimeError(
                    f"Disaggregated server did not become ready within {server_start_timeout} seconds"
                )

            # Run the cancel stress test
            run_cancel_stress_test(server_url,
                                   num_bursts=num_bursts,
                                   requests_per_burst=requests_per_burst)

            # Verify server is still healthy after stress test by sending a normal request
            client_dir = f"{example_dir}/clients"
            client_cmd = [
                'python3', f'{client_dir}/disagg_client.py', '-c', config_file,
                '-p', f'{client_dir}/prompts.json', '--ignore-eos',
                '--server-start-timeout',
                str(server_start_timeout)
            ]
            check_call(client_cmd,
                       env=env,
                       poll_procs=[workers_proc, server_proc])

    except Exception:
        logger.error("-------- Workers output --------")
        with open('output_workers.log', 'r') as f:
            logger.error(f.read())

        logger.error("-------- Disagg server output --------")
        with open('output_disagg.log', 'r') as f:
            logger.error(f.read())
        raise
    finally:
        if 'server_proc' in locals() and 'workers_proc' in locals():
            server_proc.terminate()
            workers_proc.terminate()
            server_proc.wait()
            workers_proc.wait()


@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'],
                         indirect=True)
def test_disaggregated_cancel_large_context_requests(disaggregated_test_root,
                                                     disaggregated_example_root,
                                                     llm_venv,
                                                     deepseek_v3_model_root):
    """
    Test that the disaggregated server handles request cancellations gracefully.

    This test sends bursts of requests with large contexts and cancels them
    during prefill to stress test resource cleanup.
    """
    src_dst_dict = {
        deepseek_v3_model_root:
        f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16",
    }
    for src, dst in src_dst_dict.items():
        if not os.path.islink(dst):
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            os.symlink(src, dst, target_is_directory=True)

    run_disaggregated_cancel_test(disaggregated_example_root,
                                  "cancel_stress_test",
                                  env=llm_venv._new_env,
                                  cwd=llm_venv.get_working_directory(),
                                  num_bursts=5,
                                  requests_per_burst=32)


@pytest.mark.skip_less_device(8)
@skip_pre_blackwell
@pytest.mark.parametrize("model_path", ['DeepSeek-V3-0324-FP4'])
def test_disaggregated_cancel_large_context_requests_long(
        disaggregated_test_root, disaggregated_example_root, llm_venv,
        model_path):
    """Test that disaggregated server handles request cancellations gracefully.

    This test sends bursts of requests with large contexts and cancels them
    during prefill to stress test resource cleanup.
    """
    model_dir = f"{llm_models_root()}/{model_path}"
    src_dst_dict = {
        model_dir: f"{llm_venv.get_working_directory()}/{model_path}",
    }
    for src, dst in src_dst_dict.items():
        if not os.path.islink(dst):
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            os.symlink(src, dst, target_is_directory=True)

    run_disaggregated_cancel_test(disaggregated_example_root,
                                  "cancel_stress_test_large",
                                  env=llm_venv._new_env,
                                  cwd=llm_venv.get_working_directory(),
                                  num_bursts=1000,
                                  requests_per_burst=32)
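A quick way to exercise the burst canceller above against any OpenAI-compatible endpoint outside pytest (the URL and burst sizes are illustrative; needs `pip install aiohttp`):

run_cancel_stress_test("http://localhost:8000",
                       num_bursts=2,
                       requests_per_burst=8)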
@ -3,15 +3,7 @@ network_name,perf_case_name,test_name,threshold,absolute_threshold,metric_type,p
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,76.45,
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,9785.75,
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,55.64,
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,171845.02,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.17,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,48.09,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,6155.59,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,139897.82,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,69.59,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,58.63,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,7504.07,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,125068.76,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.09,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,65.50,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,8384.00,H100
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",0.1,50,INFERENCE_TIME,1359184.5059,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,50,KV_CACHE_SIZE,10.92,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,SEQ_THROUGHPUT,0.3767,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,TOKEN_THROUGHPUT,385.7372,H100_PCIe
tests/integration/defs/perf/disagg/cleanup_jobs.sh (new file, 108 lines)
@ -0,0 +1,108 @@
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
#   bash cleanup_jobs.sh
#
# Environment variables:
#   OUTPUT_PATH: Directory containing jobs.txt and pytest.pid

set -e

OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"

echo "=========================================="
echo "SLURM Job Cleanup Script"
echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""

# Show pytest PID if available (for debugging)
if [ -f "$PID_FILE" ]; then
    PYTEST_PID=$(tr -d '\n' < "$PID_FILE")
    echo "Pytest PID: $PYTEST_PID"

    # Check if pytest is still running
    if kill -0 "$PYTEST_PID" 2>/dev/null; then
        echo "Status: Still running"
    else
        echo "Status: Already terminated"
    fi
    echo ""
else
    echo "No pytest.pid found (test may not have started)"
    echo ""
fi

# Check if jobs.txt exists
if [ ! -f "$JOBS_FILE" ]; then
    echo "[WARN] No jobs.txt found"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

echo "[INFO] Reading jobs from: $JOBS_FILE"

# Read, deduplicate, and filter empty lines
JOBS=$(sort -u "$JOBS_FILE" | grep -v '^$' || true)

if [ -z "$JOBS" ]; then
    echo "[WARN] jobs.txt is empty"
    echo "       Nothing to cancel"
    echo "=========================================="
    exit 0
fi

JOB_COUNT=$(echo "$JOBS" | wc -l)
echo "Found $JOB_COUNT job(s) to cancel"
echo ""

# Cancel each job
CANCELLED=0
ALREADY_DONE=0
FAILED=0

echo "Cancelling jobs..."
while IFS= read -r job_id; do
    if [ -n "$job_id" ]; then
        printf "  %-12s ... " "$job_id"

        # Try to cancel the job
        if scancel "$job_id" 2>/dev/null; then
            echo "[OK] Cancelled"
            CANCELLED=$((CANCELLED + 1))
        else
            # Check if job exists in squeue
            if squeue -j "$job_id" -h 2>/dev/null | grep -q "$job_id"; then
                echo "[FAIL] Failed to cancel"
                FAILED=$((FAILED + 1))
            else
                echo "[SKIP] Already finished"
                ALREADY_DONE=$((ALREADY_DONE + 1))
            fi
        fi
    fi
done <<< "$JOBS"

echo ""
echo "=========================================="
echo "[DONE] Cleanup completed"
echo "  Total: $JOB_COUNT"
echo "  Cancelled: $CANCELLED"
echo "  Already done: $ALREADY_DONE"
echo "  Failed: $FAILED"
echo "=========================================="

# Exit with error if any cancellation actually failed
if [ $FAILED -gt 0 ]; then
    exit 1
fi

exit 0
@ -151,6 +151,7 @@ class BatchManager:

        self.submitted_batches = set()  # Track which batch numbers have been submitted
        self.job_mapping = {}  # Map test_id -> SLURM job_id
        self.submit_errors = {}  # Map test_id -> error message (validation/submission failures)
        self.all_configs = []  # Ordered list of all test configs

        logger.info(f"\n{'=' * 70}")
@ -214,6 +215,8 @@ class BatchManager:
            batch_num: Batch number to submit (0-indexed)
        """
        from execution.executor import JobManager
        from utils.config_validator import ConfigValidator
        from utils.job_tracker import JobTracker

        # Calculate batch range
        if self.batch_size:
@ -230,33 +233,56 @@ class BatchManager:
        logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
        logger.info(f"{'=' * 70}\n")

        # Submit all jobs in this batch
        # Pre-validate all configs before submission
        logger.info("Pre-validating configurations...")
        valid_configs = []
        for config in batch_configs:
            try:
                ConfigValidator.validate_test_config(config)
                valid_configs.append(config)
            except Exception as e:
                # Validation failed - mark as None and record error
                self.job_mapping[config.test_id] = None
                self.submit_errors[config.test_id] = f"Validation failed: {str(e)}"
                logger.error(f"  [FAILED] Validation failed: {config.test_id}")
                logger.error(f"    Error: {str(e)[:100]}")

        logger.info(
            f"Validation complete: {len(valid_configs)}/{len(batch_configs)} configs valid\n"
        )

        # Submit only valid configs
        success_count = 0
        for i, config in enumerate(batch_configs, 1):
        for i, config in enumerate(valid_configs, 1):
            try:
                success, job_id = JobManager.submit_test_job(config)
                if success and job_id:
                    self.job_mapping[config.test_id] = job_id
                    JobTracker.record_job(job_id)  # Record job ID for cleanup
                    success_count += 1
                    # Truncate test_id for display
                    display_id = (
                        config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
                    )
                    logger.success(f"  [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
                    logger.success(
                        f"  [{i:3d}/{len(valid_configs)}] Job {job_id} <- {config.test_id}"
                    )
                else:
                    # Submission failed - mark as None and record error
                    self.job_mapping[config.test_id] = None
                    logger.error(f"  [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
                    self.submit_errors[config.test_id] = f"Job submission failed: {job_id}"
                    logger.error(f"  [{i:3d}/{len(valid_configs)}] Failed: {config.test_id}")
            except Exception as e:
                # Submission exception - mark as None and record error
                self.job_mapping[config.test_id] = None
                logger.error(f"  [{i:3d}/{len(batch_configs)}] Error: {e}")
                self.submit_errors[config.test_id] = f"Submission exception: {str(e)}"
                logger.error(f"  [{i:3d}/{len(valid_configs)}] Error: {e}")

        # Mark batch as submitted
        self.submitted_batches.add(batch_num)

        logger.info(f"\n{'=' * 70}")
        logger.success(
            f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
            f"Batch {batch_num} Complete: {success_count}/{len(valid_configs)} submitted successfully"
        )
        if len(valid_configs) < len(batch_configs):
            logger.warning(f"Skipped {len(batch_configs) - len(valid_configs)} invalid config(s)")
        logger.info(f"{'=' * 70}\n")
@ -271,7 +271,7 @@ class JobManager:

    @staticmethod
    def backup_logs(
        job_id: str,
        job_id: Optional[str],
        test_config,
        result_dir: str,
        is_passed: bool,
@ -279,13 +279,18 @@ class JobManager:
        """Backup logs and config files to test_id directory.

        Args:
            job_id: SLURM job ID
            job_id: SLURM job ID (None if submission failed)
            test_config: TestConfig object
            result_dir: Result directory path (already named as test_id)
            is_passed: Whether the job passed
        Returns:
            Final directory path if successful, None otherwise
        """
        if job_id is None:
            logger.warning(f"Job submission failed for {test_config.test_id}")
        else:
            logger.info(f"Backing up logs for job {job_id} ({test_config.test_id})")

        if not os.path.exists(result_dir):
            logger.warning(f"Result directory does not exist yet: {result_dir}")
            return None
@ -92,6 +92,13 @@ class HypothesisTestingParams:
# Dataset default parameters for hypothesis testing
# Extracted from accuracy_core.py AccuracyTask subclasses
DATASET_DEFAULTS = {
    "aime25": {
        "alpha": 0.05,
        "beta": 0.2,
        "sigma": 50,
        "num_samples": 30,  # AIME 2025 full sample size
        "higher_is_better": True,
    },
    "gsm8k": {
        "alpha": 0.05,
        "beta": 0.2,
@ -127,6 +134,14 @@ DATASET_DEFAULTS = {
        "num_samples": 198,
        "higher_is_better": True,
    },
    # Alias for gpqa_diamond (same task, different naming convention)
    "gpqa_diamond_cot_zeroshot": {
        "alpha": 0.05,
        "beta": 0.2,
        "sigma": 50,
        "num_samples": 198,
        "higher_is_better": True,
    },
    "json_mode_eval": {
        "alpha": 0.05,
        "beta": 0.2,
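Assuming these parameters feed a standard two-sample z-approximation (the repo's exact test may differ), the minimal detectable effect implied by one of these rows can be sketched as:

from statistics import NormalDist

def min_detectable_diff(alpha=0.05, beta=0.2, sigma=50, num_samples=198):
    # One-sided two-sample z-approximation: smallest accuracy drop
    # detectable at significance alpha with power (1 - beta).
    z = NormalDist().inv_cdf
    return (z(1 - alpha) + z(1 - beta)) * sigma * (2 / num_samples) ** 0.5

print(f"{min_detectable_diff():.2f}")  # ~12.5 points with the gpqa_diamond defaults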
@ -22,44 +22,18 @@ cd "$WORK_DIR"
python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
echo "System information collection completed"

# Step 2: Handle different installation modes
echo ""
echo "Step 2: Installing TensorRT-LLM..."
# Step 2: Collect TensorRT-LLM version information (only for none mode)
if [ "$INSTALL_MODE" = "none" ]; then
    echo "Using built-in TensorRT-LLM, skipping installation"

elif [ "$INSTALL_MODE" = "wheel" ]; then
    echo "Installing TensorRT-LLM wheel..."
    echo "Wheel path pattern: $WHEEL_PATH"

    # Expand wildcard and install
    for wheel_file in $WHEEL_PATH; do
        if [ -f "$wheel_file" ]; then
            echo "Found wheel: $wheel_file"
            pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
            break
        fi
    done
    echo "Wheel installation completed"

elif [ "$INSTALL_MODE" = "source" ]; then
    echo "Installing TensorRT-LLM from source..."
    cd "$REPO_DIR"
    pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
    echo "Source installation completed"

    echo ""
    echo "Step 2: Collecting TensorRT-LLM version information..."
    VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
    python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
    echo "TensorRT-LLM version written to: $VERSION_FILE"
else
    echo "ERROR: Invalid install mode: $INSTALL_MODE"
    exit 1
    echo ""
    echo "Step 2: Skipping TensorRT-LLM version collection (install_mode=$INSTALL_MODE)"
fi

# Step 3: Collect TensorRT-LLM version information
echo ""
echo "Step 3: Collecting TensorRT-LLM version information..."
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
echo "TensorRT-LLM version written to: $VERSION_FILE"

echo ""
echo "=========================================="
echo "Session Collect Job Completed"
@ -77,12 +77,12 @@ worker_config:
  stream_interval: 20
  num_postprocess_workers: 4
ctx:
  max_batch_size: 8
  max_batch_size: 1
  max_num_tokens: 131104
  max_seq_len: 131104
  tensor_parallel_size: 1
  moe_expert_parallel_size: 1
  enable_attention_dp: true
  enable_attention_dp: false
  pipeline_parallel_size: 8
  print_iter_log: true
  cuda_graph_config: null
@ -22,7 +22,7 @@ benchmark:
  multi_round: 8
  benchmark_ratio: 0.8
  streaming: true
  concurrency_list: '6144'
  concurrency_list: '1024'
  input_length: 1024
  output_length: 1024
  dataset_file: <dataset_file>
@ -81,6 +81,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -89,7 +92,7 @@ worker_config:
      - cuda_core
  cache_transceiver_config:
    max_tokens_in_buffer: 4608
    backend: NIXLf
    backend: NIXL
  stream_interval: 20
  num_postprocess_workers: 4
ctx:
@ -81,6 +81,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -82,6 +82,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -82,6 +82,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -81,6 +81,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -81,6 +81,9 @@ worker_config:
  moe_config:
    backend: CUTEDSL
    use_low_precision_moe_combine: true
    load_balancer:
      num_slots: 288
      layer_updates_per_iter: 1
  nvfp4_gemm_config:
    allowed_backends:
      - cutlass
@ -47,6 +47,11 @@ else:
@pytest.fixture(scope="session", autouse=True)
def session_lifecycle():
    """Session lifecycle management."""
    from utils.job_tracker import JobTracker

    # Record pytest main process PID for GitLab CI cleanup
    JobTracker.record_pid()

    session_tracker.start()
    try:
        yield
@ -66,11 +71,8 @@ class TestDisaggBenchmark:
        """Performance benchmark test for YAML configurations."""
        full_test_name = request.node.name

        # Validate configuration first (before any other operations)
        try:
            ConfigValidator.validate_test_config(test_config)
        except Exception as e:
            pytest.fail(f"Configuration validation failed: {e}")
        # Note: Configuration validation is done during batch submission (in conftest.py)
        # If validation failed, job_id will be None and the assert below will fail

        # Create test case tracker
        test_tracker = TestCaseTracker()
@ -104,8 +106,11 @@ class TestDisaggBenchmark:
            # Get job_id from batch manager (auto-submits batch if needed)
            job_id = batch_manager.get_job_id(test_config)

            # Validate submission result
            assert job_id, f"Failed to get job_id for {test_config.test_id}"
            # Validate submission result (will be None if validation/submission failed)
            error_msg = batch_manager.submit_errors.get(
                test_config.test_id, "Check batch submission logs for details"
            )
            assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"

            # Wait for completion (timeout: 10 hours = 36000 seconds)
            JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@ -125,13 +130,12 @@ class TestDisaggBenchmark:
            raise e
        finally:
            # Always backup logs, regardless of success or failure
            if job_id:
                result_dir = JobManager.get_result_dir(test_config)
                is_passed = result.get("success", False) if result else False
                try:
                    JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
                except Exception as backup_error:
                    logger.error(f"Failed to backup logs: {backup_error}")
            result_dir = JobManager.get_result_dir(test_config)
            is_passed = result.get("success", False) if result else False
            try:
                JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
            except Exception as backup_error:
                logger.error(f"Failed to backup logs: {backup_error}")

    @pytest.mark.accuracy
    @pytest.mark.parametrize("test_config", ACCURACY_TEST_CASES)
@ -204,13 +208,12 @@ class TestDisaggBenchmark:
            raise e
        finally:
            # Always backup logs, regardless of success or failure
            if job_id:
                result_dir = JobManager.get_result_dir(test_config)
                is_passed = result.get("success", False) if result else False
                try:
                    JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
                except Exception as backup_error:
                    logger.error(f"Failed to backup logs: {backup_error}")
            result_dir = JobManager.get_result_dir(test_config)
            is_passed = result.get("success", False) if result else False
            try:
                JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
            except Exception as backup_error:
                logger.error(f"Failed to backup logs: {backup_error}")

    @pytest.mark.stress
    @pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
@ -222,11 +225,8 @@ class TestDisaggBenchmark:
        """
        full_test_name = request.node.name

        # Validate configuration first (before any other operations)
        try:
            ConfigValidator.validate_test_config(test_config)
        except Exception as e:
            pytest.fail(f"Configuration validation failed: {e}")
        # Note: Configuration validation is done during batch submission (in conftest.py)
        # If validation failed, job_id will be None and the assert below will fail

        # Create test case tracker
        test_tracker = TestCaseTracker()
@ -266,8 +266,11 @@ class TestDisaggBenchmark:
            # Get job_id from batch manager (auto-submits batch if needed)
            job_id = batch_manager.get_job_id(test_config)

            # Validate submission result
            assert job_id, f"Failed to get job_id for {test_config.test_id}"
            # Validate submission result (will be None if validation/submission failed)
            error_msg = batch_manager.submit_errors.get(
                test_config.test_id, "Check batch submission logs for details"
            )
            assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"

            # Wait for completion (timeout: 10 hours = 36000 seconds)
            JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@ -287,13 +290,12 @@ class TestDisaggBenchmark:
            raise e
        finally:
            # Always backup logs, regardless of success or failure
            if job_id:
                result_dir = JobManager.get_result_dir(test_config)
                is_passed = result.get("success", False) if result else False
                try:
                    JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
                except Exception as backup_error:
                    logger.error(f"Failed to backup logs: {backup_error}")
            result_dir = JobManager.get_result_dir(test_config)
            is_passed = result.get("success", False) if result else False
            try:
                JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
            except Exception as backup_error:
                logger.error(f"Failed to backup logs: {backup_error}")


if __name__ == "__main__":
tests/integration/defs/perf/disagg/utils/job_tracker.py (new file, 61 lines)
@ -0,0 +1,61 @@
"""Simple job and process tracker for GitLab CI cleanup."""

import os

from utils.common import EnvManager
from utils.logger import logger


class JobTracker:
    """Track SLURM job IDs and pytest PID for GitLab CI cleanup."""

    @staticmethod
    def get_jobs_file() -> str:
        """Get jobs.txt file path in output_path."""
        output_path = EnvManager.get_output_path()
        return os.path.join(output_path, "jobs.txt")

    @staticmethod
    def get_pid_file() -> str:
        """Get pytest.pid file path in output_path."""
        output_path = EnvManager.get_output_path()
        return os.path.join(output_path, "pytest.pid")

    @staticmethod
    def record_pid():
        """Record pytest main process PID to pytest.pid file."""
        pid = os.getpid()
        pid_file = JobTracker.get_pid_file()
        try:
            # Ensure output directory exists
            os.makedirs(os.path.dirname(pid_file), exist_ok=True)

            # Write PID
            with open(pid_file, "w") as f:
                f.write(f"{pid}\n")
                f.flush()

            logger.info(f"Recorded pytest PID: {pid} -> {pid_file}")
        except Exception as e:
            logger.warning(f"Failed to record PID: {e}")

    @staticmethod
    def record_job(job_id: str):
        """Append SLURM job ID to jobs.txt file.

        Args:
            job_id: SLURM job ID to record
        """
        jobs_file = JobTracker.get_jobs_file()
        try:
            # Ensure output directory exists
            os.makedirs(os.path.dirname(jobs_file), exist_ok=True)

            # Append job ID
            with open(jobs_file, "a") as f:
                f.write(f"{job_id}\n")
                f.flush()

            logger.debug(f"Recorded SLURM job: {job_id}")
        except Exception as e:
            logger.warning(f"Failed to record job ID {job_id}: {e}")
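A hedged sketch of how the tracker pairs with cleanup_jobs.sh above (the job ID and the EnvManager-derived output directory are illustrative):

from utils.job_tracker import JobTracker

JobTracker.record_pid()         # once, in the session fixture
JobTracker.record_job("12345")  # after every successful sbatch submit
# The GitLab CI after_script then runs `bash cleanup_jobs.sh`, which
# scancels every ID listed in jobs.txt.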
@ -79,6 +79,8 @@ class SessionTracker:
        Uses the new sbatch-based approach for non-blocking execution.
        Submits the job and waits for completion using JobManager.
        """
        from utils.job_tracker import JobTracker

        self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logger.info(f"Session ended: {self.end_time}")
@ -89,6 +91,9 @@ class SessionTracker:
            logger.error(f"Failed to submit session collect job: {job_id}")
            return False

        # Record session collect job ID for cleanup
        JobTracker.record_job(job_id)

        # Wait for job completion (reuses wait_for_completion method)
        logger.info(f"Waiting for session collect job {job_id} to complete...")
        JobManager.wait_for_completion(
@ -22,7 +22,7 @@ import sys
import time
from datetime import datetime

from defs.trt_test_alternative import print_error, print_info, print_warning
from defs.trt_test_alternative import print_info, print_warning

_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '../../../..'))
@ -78,6 +78,7 @@ PRE_MERGE_THRESHOLD = 0.1
# scenario, allowing the underlying config to change while still comparing against baselines
# for the same scenario.
SCENARIO_MATCH_FIELDS = [
    "s_gpu_type",
    "s_runtime",
    "s_model_name",
    "l_isl",
@ -282,28 +283,27 @@ def query_history_data(common_values_dict):
                f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned no response"
            )
            return None
        else:
            payload = res.json().get("hits", {}).get("hits", [])
            if len(payload) == 0:
                # No history data found in database, return empty list
                print_info(
                    f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
                )
                return []
            for hit in payload:
                data_dict = hit.get("_source", {})
                data_dict["_id"] = hit.get("_id", "")
                if data_dict["_id"] == "":
                    print_info(
                        f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
                    )
                    # Invalid data, return None
                    return None
                data_list.append(data_dict)
            print_info(
                f"Successfully queried from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
            )
            return data_list
        payload = res.json().get("hits", {}).get("hits", [])
        if len(payload) == 0:
            # No history data found in database, return empty list
            print_info(
                f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
            )
            return []
        for hit in payload:
            data_dict = hit.get("_source", {})
            data_dict["_id"] = hit.get("_id", "")
            if data_dict["_id"] == "":
                print_info(
                    f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
                )
                # Invalid data, return None
                return None
            data_list.append(data_dict)
        print_info(
            f"Successfully queried from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
        )
        return data_list
    except Exception as e:
        print_info(
            f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned error: {e}"
@ -522,7 +522,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
                # Add metric info to s_regression_info
                metric_info = (f"{metric}'s value: {new_value} "
                               f"baseline value: {baseline_value} "
                               f"threshold: {threshold} "
                               f"threshold: {threshold * 100:.2f}% "
                               f"diff: {diff:+.2f}%")
                info_parts.append(metric_info)
@ -643,65 +643,19 @@ def _get_metric_keys():
    return metric_keys


def _print_perf_data(data):
    """Print performance metrics and config for a single data entry."""
    print_info("=== Metrics ===")
    for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
        if metric in data:
            value = data.get(metric, "N/A")
            print_info(f'"{metric}": {value}')

    metric_keys = _get_metric_keys()
    print_info("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        value = data[key]
        print_info(f'"{key}": {value}')


def _print_regression_data(data, print_func=None):
    """
    Print regression info, metrics with baselines/thresholds, and config.
    Print regression info and config.
    """
    if print_func is None:
        print_func = print_info

    if "s_regression_info" in data:
        print_func("=== Regression Info ===")
        print_func(f"{data['s_regression_info']}")
        for item in data["s_regression_info"].split(","):
            print_func(item.strip())

    metric_keys = _get_metric_keys()
    is_post_merge = data.get("b_is_post_merge", False)

    print_func("=== Metrics ===")
    for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
        metric_suffix = metric[2:]  # Strip "d_" prefix
        baseline_key = f"d_baseline_{metric_suffix}"
        if is_post_merge:
            threshold_key = f"d_threshold_post_merge_{metric_suffix}"
        else:
            threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
        # Only print if at least one of the keys exists
        if metric in data or baseline_key in data or threshold_key in data:
            value = data.get(metric, "N/A")
            baseline = data.get(baseline_key, "N/A")
            threshold = data.get(threshold_key, "N/A")
            # Calculate percentage difference between value and baseline
            # Positive percentage means better perf, negative means regression
            if (isinstance(value, (int, float))
                    and isinstance(baseline, (int, float)) and baseline != 0):
                if metric in MAXIMIZE_METRICS:
                    # Larger is better: value > baseline is positive (better)
                    percentage = (value - baseline) / baseline * 100
                else:
                    # Smaller is better: value < baseline is positive (better)
                    percentage = (baseline - value) / baseline * 100
                percentage_str = f"{percentage:+.2f}%"
            else:
                percentage_str = "N/A"
            print_func(
                f'"{metric}": {value}, "{baseline_key}": {baseline}, '
                f'"{threshold_key}": {threshold}, "diff": {percentage_str}')

    print_func("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
@ -712,16 +666,17 @@ def _print_regression_data(data, print_func=None):
|
||||
print_func(f'"{key}": {value}')
|
||||
|
||||
|
||||
def check_perf_regression(new_data_dict):
def check_perf_regression(new_data_dict, fail_on_regression=False):
    """
    Check performance regression by printing regression data from new_data_dict.
    If fail_on_regression is True, a RuntimeError is raised when regressions are found.
    (This is a temporary flag for failing regression tests; we are observing its stability and will enable it by default soon.)
    """
    # Filter regression data from new_data_dict
    regressive_data_list = [
        data for data in new_data_dict.values()
        if data.get("b_is_regression", False)
    ]

    # Split regression data into post-merge and pre-merge
    post_merge_regressions = [
        data for data in regressive_data_list
@ -735,24 +690,34 @@ def check_perf_regression(new_data_dict):
    # Print pre-merge regression data with print_warning
    if len(pre_merge_regressions) > 0:
        print_warning(
            f"Found {len(pre_merge_regressions)} pre-merge regression data")
            f"Found {len(pre_merge_regressions)} pre-merge perf regression data"
        )
        for i, data in enumerate(pre_merge_regressions):
            print_warning(f"\n{'=' * 60}")
            print_warning(f"Pre-merge Regression Data #{i + 1}")
            print_warning("=" * 60)
            _print_regression_data(data, print_func=print_warning)

    # Print post-merge regression data with print_error
        if fail_on_regression:
            raise RuntimeError(
                f"Found {len(pre_merge_regressions)} pre-merge perf regression data"
            )

    # Print post-merge regression data with print_warning
    if len(post_merge_regressions) > 0:
        print_warning(
            f"Found {len(post_merge_regressions)} post-merge perf regression data"
        )
        for i, data in enumerate(post_merge_regressions):
            print_error(f"\n{'=' * 60}")
            print_error(f"Post-merge Regression Data #{i + 1}")
            print_error("=" * 60)
            _print_regression_data(data, print_func=print_error)
        print_error(
            f"Found {len(post_merge_regressions)} post-merge regression data")
        raise RuntimeError(
            f"Found {len(post_merge_regressions)} post-merge regression data")
            print_warning(f"\n{'=' * 60}")
            print_warning(f"Post-merge Regression Data #{i + 1}")
            print_warning("=" * 60)
            _print_regression_data(data, print_func=print_warning)

        if fail_on_regression:
            raise RuntimeError(
                f"Found {len(post_merge_regressions)} post-merge perf regression data"
            )

    # Print summary if no regressions
    if len(regressive_data_list) == 0:

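A hypothetical call site for the new flag (the data shape below is a sketch of the fields the function reads, not a real upload record):

# check_perf_regression prints every flagged entry, then optionally fails the run.
new_data_dict = {
    0: {"b_is_regression": True, "b_is_post_merge": False,
        "s_regression_info": "seq_throughput below threshold"},
}
check_perf_regression(new_data_dict)                           # warn only
check_perf_regression(new_data_dict, fail_on_regression=True)  # raises RuntimeError
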
@ -23,7 +23,7 @@ import re
import socket
import subprocess
import time
from typing import Dict, List, NamedTuple, Tuple
from typing import Dict, List, NamedTuple, Optional, Tuple

import pytest
import requests
@ -58,6 +58,7 @@ MODEL_PATH_DICT = {
}

SUPPORTED_GPU_TYPE = [
    "H200",
    "B200",
    "B300",
    "GB200",
@ -124,6 +125,7 @@ class ServerConfig:
        self.model_name = server_config_data["model_name"]
        self.model_path = ""
        self.env_vars = env_vars
        self.disagg_run_type = server_config_data.get("disagg_run_type", "aggr")

        # Extract optional fields with defaults
        self.tp = server_config_data.get("tensor_parallel_size", 1)
@ -220,9 +222,12 @@ class ServerConfig:
            "concurrency",
            "name",
            "model_name",
            "disagg_run_type",
            "gpus",
            "gpus_per_node",
            "match_mode",
            "client_configs",
            "match_mode",
        ]
        self.extra_llm_api_config_data = {
            k: v for k, v in server_config_data.items() if k not in exclude_keys
@ -234,7 +239,7 @@ class ServerConfig:
        """Generate server command."""
        model_dir = get_model_dir(self.model_name)
        self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
        config_filename = f"extra-llm-api-config.{self.name}.yml"
        config_filename = f"extra-llm-api-config.{self.disagg_run_type}.{self.name}.yml"
        config_path = os.path.join(output_dir, config_filename)

        numa_bind_cmd = []
@ -517,7 +522,9 @@ class AggrTestCmds(NamedTuple):
        )

        wait_for_endpoint_ready(
            f"http://{server_hostname}:{server_port}/health", timeout=self.timeout
            f"http://{server_hostname}:{server_port}/health",
            timeout=self.timeout,
            server_proc=server_proc,
        )

        # Run all clients for this server
@ -667,10 +674,13 @@ class DisaggTestCmds(NamedTuple):
                break
            time.sleep(10)

    def wait_for_endpoint_ready(self, url: str):
    def wait_for_endpoint_ready(self, url: str, server_files: List[str] = None):
        """Wait for endpoint to be ready."""
        start = time.monotonic()
        iteration = 0
        error_keywords = ["RuntimeError", "out of memory", "ValueError"]
        while True:
            iteration += 1
            elapsed_time = time.monotonic() - start
            if elapsed_time > self.timeout:
                print_error(
@ -678,6 +688,22 @@ class DisaggTestCmds(NamedTuple):
                )
                break
            print_info(f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s")

            if server_files and iteration % 30 == 0:
                for server_file in server_files:
                    if os.path.exists(server_file):
                        try:
                            with open(server_file, "r") as f:
                                content = f.read()
                            for line in content.splitlines():
                                for keyword in error_keywords:
                                    if keyword in line:
                                        print_error(
                                            f"Found '{keyword}' in server file {server_file}: {line}"
                                        )
                        except Exception as e:
                            print_info(f"Failed to read server file {server_file}: {e}")

            try:
                time.sleep(10)
                if requests.get(url).status_code == 200:
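The keyword scan above only reports findings while polling continues; a standalone sketch of the same check (the log excerpt is hypothetical):

# Scan a server log for fatal-looking keywords, as wait_for_endpoint_ready does every ~30 polls.
error_keywords = ["RuntimeError", "out of memory", "ValueError"]
log_text = "INFO: loading engine\nRuntimeError: CUDA out of memory\n"
for line in log_text.splitlines():
    for keyword in error_keywords:
        # A line can match several keywords and is reported once per match, as in the diff.
        if keyword in line:
            print(f"Found '{keyword}' in server log: {line}")
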
@ -693,7 +719,6 @@ class DisaggTestCmds(NamedTuple):
        port = get_free_port()

        ctx_cmd, gen_cmd, disagg_cmd = self.server_cmds[server_idx]

        if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type:
            self._generate_hostname_file(server_idx, port)
            server_file_path = os.path.join(
@ -702,7 +727,6 @@ class DisaggTestCmds(NamedTuple):
            is_ctx = "CTX" in self.disagg_serving_type
            server_cmd = ctx_cmd if is_ctx else gen_cmd
            server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port)

            try:
                print_info(
                    f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}"
@ -724,7 +748,6 @@ class DisaggTestCmds(NamedTuple):
            disagg_server_file_path = os.path.join(
                self.output_dir, f"trtllm-serve.{server_idx}.{self.disagg_serving_type}.log"
            )

            try:
                self._generate_disagg_server_config(server_idx, port)
                print_info(f"Starting disagg server. cmd is {disagg_cmd}")
@ -746,8 +769,24 @@ class DisaggTestCmds(NamedTuple):
            disagg_server_hostname, disagg_server_port = (
                self._get_disagg_server_hostname_and_port(server_idx)
            )
            server_files = [
                os.path.join(self.output_dir, f"trtllm-serve.{server_idx}.DISAGG_SERVER.log"),
            ]
            for ctx_idx in range(self.num_ctx_servers):
                server_files.append(
                    os.path.join(
                        self.output_dir, f"trtllm-serve.{server_idx}.CTX_{ctx_idx}.log"
                    )
                )
            for gen_idx in range(self.num_gen_servers):
                server_files.append(
                    os.path.join(
                        self.output_dir, f"trtllm-serve.{server_idx}.GEN_{gen_idx}.log"
                    )
                )
            self.wait_for_endpoint_ready(
                f"http://{disagg_server_hostname}:{disagg_server_port}/health"
                f"http://{disagg_server_hostname}:{disagg_server_port}/health",
                server_files=server_files,
            )

            # Run all clients for this server
@ -799,7 +838,6 @@ class PerfSanityTestConfig:

    def __init__(self, test_case_name: str, output_dir: str):
        self._output_dir = output_dir
        self._test_results: Dict[int, Dict[str, float]] = {}
        self._perf_results: Dict[int, List[Dict[str, float]]] = {}

        # Parse test case name
@ -977,6 +1015,7 @@ class PerfSanityTestConfig:
            "name": config_file_base_name,
            "model_name": model_name,
            "gpus_per_node": gpus_per_node,
            "disagg_run_type": "ctx",
            **worker_config.get("ctx", {}),
        }

@ -986,6 +1025,7 @@ class PerfSanityTestConfig:
            "name": config_file_base_name,
            "model_name": model_name,
            "gpus_per_node": gpus_per_node,
            "disagg_run_type": "gen",
            **worker_config.get("gen", {}),
        }

@ -1047,7 +1087,7 @@ class PerfSanityTestConfig:

        # Generate extra-llm-api-config.yml
        config_content = server_config.generate_extra_llm_api_config()
        config_filename = f"extra-llm-api-config.{server_config.name}.yml"
        config_filename = f"extra-llm-api-config.aggr.{server_config.name}.yml"
        config_path = os.path.join(output_dir, config_filename)
        with open(config_path, "w") as f:
            f.write(config_content)
@ -1080,7 +1120,9 @@ class PerfSanityTestConfig:
        ctx_cmd = ctx_config.to_cmd(output_dir, numa_bind, "CTX")
        if "CTX" in disagg_serving_type:
            config_content = ctx_config.generate_extra_llm_api_config()
            config_path = os.path.join(output_dir, "extra-llm-api-config.ctx.yml")
            config_path = os.path.join(
                output_dir, f"extra-llm-api-config.ctx.{ctx_config.name}.yml"
            )
            with open(config_path, "w") as f:
                f.write(config_content)

@ -1088,7 +1130,9 @@ class PerfSanityTestConfig:
        gen_cmd = gen_config.to_cmd(output_dir, numa_bind, "GEN")
        if "GEN" in disagg_serving_type:
            config_content = gen_config.generate_extra_llm_api_config()
            config_path = os.path.join(output_dir, "extra-llm-api-config.gen.yml")
            config_path = os.path.join(
                output_dir, f"extra-llm-api-config.gen.{gen_config.name}.yml"
            )
            with open(config_path, "w") as f:
                f.write(config_content)

@ -1165,44 +1209,59 @@ class PerfSanityTestConfig:
        if failed_requests_match:
            failed_count = int(failed_requests_match.group(1))
            if failed_count > 0:
                print_error(f"Benchmark output contains {failed_count} failed requests.")
                raise Exception(f"Benchmark has {failed_count} failed requests")
                error_msg = f"Benchmark output contains {failed_count} failed requests."
                raise Exception(error_msg)

        # Check for explicit failure markers
        if "!FAILED REQUESTS!" in output or "!CHECK LOG FOR ERRORS!" in output:
            print_error("Benchmark output contains failure markers.")
            raise Exception("Benchmark output contains failure markers")
            error_msg = "Benchmark output contains failure markers."
            raise Exception(error_msg)

    def get_perf_result(self, outputs: Dict[int, List[str]]):
        """Parse performance results from outputs."""
        self._perf_results = {}

        for server_idx, server_outputs in outputs.items():
            self._perf_results[server_idx] = []

            for output in server_outputs:
                metrics = {}
        def parse_metrics_from_output(output: str) -> Optional[Dict[str, float]]:
            """Parse all metrics from a single output string."""
            metrics = {}
            for line in output.split("\n"):
                for metric_type, regex in PERF_METRIC_LOG_QUERIES.items():
                    regex_matches = [regex.search(line) for line in output.split("\n")]
                    for match in regex_matches:
                        if match:
                            value = None
                            for i in range(1, len(match.groups()) + 1):
                                if match.group(i) is not None:
                                    value = match.group(i)
                                    break
                            if value is not None:
                                metrics[metric_type] = float(value)
                                break
                    if metric_type in metrics:
                        continue
                    match = regex.search(line)
                    if match:
                        metrics[metric_type] = float(match.group(1))
                        break
            return metrics

        self._perf_results = {}
        for server_idx, client_configs in self.server_client_configs.items():
            self._perf_results[server_idx] = []
            server_outputs = outputs.get(server_idx, [])
            for output in server_outputs:
                metrics = parse_metrics_from_output(output)
                self._perf_results[server_idx].append(metrics)

        # Also populate _test_results for upload (flattened view)
        cmd_idx = 0
        for server_idx in sorted(self._perf_results.keys()):
            for client_metrics in self._perf_results[server_idx]:
                self._test_results[cmd_idx] = client_metrics
                cmd_idx += 1
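The removed parsing loop above picks the first non-empty capture group, which matters for patterns with alternations; a small standalone illustration (the pattern and log line are hypothetical, not entries of PERF_METRIC_LOG_QUERIES):

import re

# Either alternative may carry the value; only one capture group matches.
regex = re.compile(r"token throughput: ([\d.]+)|tps=([\d.]+)")
match = regex.search("tps=123.4")
value = next(g for g in match.groups() if g is not None)
print(float(value))  # 123.4
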
    def check_test_failure(self):
        """Check if any server failed based on perf results."""
        error_msg = ""
        for server_idx, client_configs in self.server_client_configs.items():
            server_perf_results = self._perf_results.get(server_idx, [])
            if len(server_perf_results) != len(client_configs):
                error_msg += (
                    f"Server {server_idx}'s perf result count {len(server_perf_results)} "
                    f"does not equal its client count {len(client_configs)}. "
                )
            for client_idx, metrics in enumerate(server_perf_results):
                if len(metrics) != len(PERF_METRIC_LOG_QUERIES):
                    error_msg += (
                        f"Some metrics in Server {server_idx} Client {client_idx} are missing. "
                        f"The broken metrics are {metrics}. "
                    )

        if error_msg:
            raise Exception(error_msg)

        print_info("All servers passed")

    def upload_test_results_to_database(self):
        """Upload test results and baseline to database."""
@ -1219,25 +1278,27 @@ class PerfSanityTestConfig:
            return {add_prefix(key, prefix_name): value for key, value in config_dict.items()}

        match_keys = []
        is_scenario_mode = False

        if self.runtime == "aggr_server":
            job_config = get_job_info()
            is_post_merge = job_config["b_is_post_merge"]
            new_data_dict = {}
            cmd_idx = 0

            for server_idx, client_configs in self.server_client_configs.items():
                server_config = self.server_configs[server_idx]
                server_config_dict = server_config.to_db_data()
                server_perf_results = self._perf_results.get(server_idx, [])
                # Skip if server failed
                if len(server_perf_results) != len(client_configs):
                    cmd_idx += len(client_configs)
                    continue

                for client_config in client_configs:
                for client_idx, client_config in enumerate(client_configs):
                    client_config_dict = client_config.to_db_data()

                    # Skip if metrics missing
                    if cmd_idx not in self._test_results or not all(
                        metric_name in self._test_results[cmd_idx]
                        for metric_name in PERF_METRIC_LOG_QUERIES
                    ):
                    if server_perf_results[client_idx] is None:
                        print_info(
                            f"Skipped posting command {cmd_idx}'s test results since some metrics are missing."
                        )
@ -1257,18 +1318,18 @@ class PerfSanityTestConfig:
                    new_data["s_test_case_name"] = f"{server_config.name}-{client_config.name}"

                    for metric_name in PERF_METRIC_LOG_QUERIES:
                        if metric_name in self._test_results[cmd_idx]:
                            new_data[f"d_{metric_name}"] = self._test_results[cmd_idx][metric_name]
                        new_data[f"d_{metric_name}"] = server_perf_results[client_idx][metric_name]

                    add_id(new_data)
                    new_data_dict[cmd_idx] = new_data
                    cmd_idx += 1

                    if not match_keys:
                        match_keys.extend(["s_gpu_type", "s_runtime"])
                        if server_config.match_mode == "scenario":
                            match_keys = SCENARIO_MATCH_FIELDS.copy()
                            is_scenario_mode = True
                        else:
                            match_keys.extend(["s_gpu_type", "s_runtime"])
                        match_keys.extend(server_config.to_match_keys())
                        match_keys.extend(client_config.to_match_keys())

@ -1285,12 +1346,16 @@ class PerfSanityTestConfig:
            for server_idx, (ctx_config, gen_config, disagg_config) in enumerate(
                self.server_configs
            ):
                for client_config in self.server_client_configs[server_idx]:
                client_configs = self.server_client_configs[server_idx]
                server_perf_results = self._perf_results.get(server_idx, [])
                # Skip if server failed
                if len(server_perf_results) != len(client_configs):
                    cmd_idx += len(client_configs)
                    continue

                for client_idx, client_config in enumerate(client_configs):
                    # Skip if metrics missing
                    if cmd_idx not in self._test_results or not all(
                        metric_name in self._test_results[cmd_idx]
                        for metric_name in PERF_METRIC_LOG_QUERIES
                    ):
                    if server_perf_results[client_idx] is None:
                        print_info(
                            f"Skipped posting command {cmd_idx}'s test results since some metrics are missing."
                        )
@ -1323,8 +1388,7 @@ class PerfSanityTestConfig:
                    new_data["s_test_case_name"] = f"{disagg_config.name}-{client_config.name}"

                    for metric_name in PERF_METRIC_LOG_QUERIES:
                        if metric_name in self._test_results[cmd_idx]:
                            new_data[f"d_{metric_name}"] = self._test_results[cmd_idx][metric_name]
                        new_data[f"d_{metric_name}"] = server_perf_results[client_idx][metric_name]

                    add_id(new_data)
                    new_data_dict[cmd_idx] = new_data
@ -1376,7 +1440,7 @@ class PerfSanityTestConfig:
            # Upload the new perf data and baseline data to database
            post_new_perf_data(new_baseline_data_dict, new_data_dict)

            check_perf_regression(new_data_dict)
            check_perf_regression(new_data_dict, fail_on_regression=is_scenario_mode)


# Perf sanity test case parameters
@ -1479,5 +1543,8 @@ def test_e2e(output_dir, perf_sanity_test_case):
    # Parse performance results
    config.get_perf_result(outputs)

    # Check for test failures
    config.check_test_failure()

    # Upload results to database
    config.upload_test_results_to_database()

@ -23,15 +23,15 @@ llm_config_database:
      system_gpu_count:
        gte: 1
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- condition:
    wildcards:
      gpu:
@ -42,15 +42,15 @@ llm_config_database:
      system_gpu_count:
        gte: 2
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- condition:
    wildcards:
      gpu:
@ -61,21 +61,21 @@ llm_config_database:
      system_gpu_count:
        gte: 4
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- condition:
    wildcards:
      gpu:
@ -86,27 +86,27 @@ llm_config_database:
      system_gpu_count:
        gte: 8
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
- condition:
    wildcards:
      gpu:
@ -117,15 +117,15 @@ llm_config_database:
      system_gpu_count:
        gte: 1
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- condition:
    wildcards:
      gpu:
@ -136,15 +136,15 @@ llm_config_database:
      system_gpu_count:
        gte: 2
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- condition:
    wildcards:
      gpu:
@ -155,15 +155,15 @@ llm_config_database:
      system_gpu_count:
        gte: 4
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- condition:
    wildcards:
      gpu:
@ -174,18 +174,18 @@ llm_config_database:
      system_gpu_count:
        gte: 8
  tests:
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
  - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]

@ -1,391 +1,4 @@
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaModel-bert/roberta-base]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha_fp32_acc-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2]
examples/test_bindings.py::test_llm_bindings_example[llama-7b]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-disable_weight_only]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only]
examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (180)
examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (180)
examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:4] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_deep_2.4b-float16-nb:1] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90)
examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-2-27b-it-other-bfloat16-8]
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8]
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it]
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it]
examples/test_gpt.py::test_llm_gpt2_medium_1gpu[non_streaming-use_py_session-disable_gemm_plugin]
examples/test_gpt.py::test_llm_gpt2_medium_1gpu[streaming-use_cpp_session-enable_gemm_plugin]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp1pp4]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp2pp2]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp4pp1]
examples/test_gpt.py::test_llm_gpt2_medium_bad_words_1gpu[non_streaming-use_cpp_session]
examples/test_gpt.py::test_llm_gpt2_medium_stop_words_1gpu[streaming-use_cpp_session]
examples/test_gpt.py::test_llm_gpt2_multi_lora_1gpu[900_stories]
examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1]
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-1]
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct]
examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_4]
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16]
|
||||
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16]
|
||||
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct]
|
||||
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-2b-instruct]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_8-float16-bs1]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu
|
||||
examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1]
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2]
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1]
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2]
examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2]
examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_with_fp32_acc-enable_gemm_plugin-enable_attention_plugin-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-34b-Instruct-tp4pp1-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-70b-hf-tp2pp2-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4]
examples/test_llama.py::test_codellama_fp8_with_bf16_lora[CodeLlama-7b-Instruct]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v2-7b-hf]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-3b]
examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp8pp1-nb:1]
examples/test_llama.py::test_llm_llama_lookahead_single_gpu_summary[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b]
examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4]
examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp16]
examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_awq]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp16]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)
examples/test_llama.py::test_llm_llama_v3_dora_1gpu[commonsense-llama-v3-8b-dora-r32-llama-v3-8b-hf-base_fp16]
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-1.4b-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-2.8b-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-370m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-790m-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-1.3b-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-2.7b-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-370m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-780m-float16-disable_gemm_plugin]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1]
# Multimodal Executor Cpp E2E Tests
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]

examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8]
examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen1.5_72b_chat-tp4pp2-context_fmha]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2_72b_instruct-tp8pp1-context_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp4pp2-context_fmha]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp8pp1-context_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen1.5_7b_chat-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_7b_instruct-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_0.5b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2] # 5 mins
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen1.5_7b_chat-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2.5_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_14b_chat_int4-nb:4]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_7b_chat_awq-nb:1]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen1.5_7b_chat-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_vl_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2.5_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2_0.5b_instruct]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_0.5b_instruct]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct]
examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat]
examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-flax-no_paged_cache-disable_quant-float16-enable_attn_plugin-disable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int4-float16-nb:1-use_cpp_runtime]

# Accuracy test list
accuracy/test_cli_flow.py::TestGpt2::test_auto_dtype
accuracy/test_cli_flow.py::TestGpt2::test_gemm_plugin
accuracy/test_cli_flow.py::TestGpt2::test_attention_ootb
accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_disabled
accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_fp32_acc
accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int8]
accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4]
accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache
accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=False-per_channel=False]
accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=True-per_channel=True]
accuracy/test_cli_flow.py::TestGpt2::test_beam_search
accuracy/test_cli_flow.py::TestGpt2::test_beam_search_large
accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin
accuracy/test_cli_flow.py::TestGpt2::test_cuda_graph
accuracy/test_cli_flow.py::TestGpt2Medium::test_auto_dtype
accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8
accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8_lm_head
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype
accuracy/test_cli_flow.py::TestStarcoder2_3B::test_auto_dtype
accuracy/test_cli_flow.py::TestStarcoder2_15B::test_smooth_quant_ootb
accuracy/test_cli_flow.py::TestGptNext::test_auto_dtype
accuracy/test_cli_flow.py::TestMinitron4BBase::test_auto_dtype
accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8
accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi2::test_tp2
accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive
accuracy/test_cli_flow.py::TestMamba130M::test_auto_dtype
accuracy/test_cli_flow.py::TestVicuna7B::test_lookahead
accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=False-chunked_context=False-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True]
accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama7B::test_beam_search
accuracy/test_cli_flow.py::TestLlama7B::test_int4_gptq
accuracy/test_cli_flow.py::TestLlama7B::test_streamingllm
accuracy/test_cli_flow.py::TestLlama2_7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[tp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[pp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_tp2cp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_weight_sparsity
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_float32
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int8]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int4]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_pp4
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-enable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-disable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_medusa_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_manage_weights
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_weight_streaming[1.0]
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_pp_reduce_scatter_tp2pp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[expert_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[mixed_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[no_renormalize-tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-expert_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-mixed_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized
accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 TIMEOUT (120)
accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] TIMEOUT (90)
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8]
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4]
accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_auto_dtype
accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_fp8
accuracy/test_cli_flow.py::TestQwen2_1_5B::test_auto_dtype_cp4
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_0_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
# text generation accuracy test
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
@@ -418,6 +31,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
@@ -428,14 +45,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
@@ -464,8 +73,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
@@ -492,9 +99,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2]
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
@@ -522,13 +126,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -536,40 +139,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
@@ -621,12 +202,98 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_m
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus

# multimodal accuracy tests
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype

# disaggregated serving accuracy test
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]
@@ -653,63 +320,12 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus
accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4

accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype

# e2e test
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session--]
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-codellama/CodeLlama-7b-Instruct-hf] # 5min
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf] # 5min
test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session---]
test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
@@ -720,9 +336,6 @@ test_e2e.py::test_openai_chat_harmony
test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]
test_e2e.py::test_trtllm_multimodal_benchmark_serving

llmapi/test_llm_examples.py::test_llmapi_server_example
# Pivot to Pytorch test cases.
test_e2e.py::test_ptp_quickstart
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
@@ -766,47 +379,9 @@ test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-
test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3]
unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora
examples/test_medusa.py::test_codellama_medusa_1gpu[CodeLlama-7b-Instruct]

examples/test_medusa.py::test_qwen_medusa_1gpu[qwen_7b_chat]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen1.5_7b_chat]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_7b_instruct]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_0.5b_instruct]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2.5_1.5b_instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[phi-2]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-mini-128k-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-small-128k-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3.5-mini-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-4-mini-instruct]
examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1]

examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1]
examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2]

examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2]
llmapi/test_llm_examples.py::test_llmapi_server_example

# e2e serve test
examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
examples/serve/test_serve.py::test_config_file_loading[--config]
examples/serve/test_serve.py::test_env_overrides_pdl
@ -827,8 +402,7 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
|
||||
examples/serve/test_serve_negative.py::test_missing_content_type_header
|
||||
examples/serve/test_serve_negative.py::test_extremely_large_batch
|
||||
|
||||
|
||||
# PyTorch flow disaggregated tests
|
||||
# e2e disaggregated serving test
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
|
||||
|
||||
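# Note: each entry above is a standard pytest node ID; assuming the usual
# integration-test layout (an assumption, not stated in these lists), a single
# case can be invoked directly with the ID copied verbatim, e.g.:
#
#   pytest "test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]"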
@ -1,39 +1,53 @@
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
# text generation accuracy test
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
@ -61,20 +75,28 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
@ -122,117 +144,29 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype

# multimodal accuracy tests
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
@ -240,28 +174,41 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized

disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin]
# disaggregated serving accuracy test
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]

# e2e test
test_e2e.py::test_openai_chat_harmony
test_e2e.py::test_openai_consistent_chat
test_e2e.py::test_openai_multi_chat_example
@ -291,3 +238,27 @@ test_e2e.py::test_trtllm_multimodal_benchmark_serving
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf-Qwen3/qwen3-235B-eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3]

# e2e disaggregated serving test
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin]

@ -179,7 +179,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]

@ -9,7 +9,6 @@ llm_perf_sanity:
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================

# 1: All GPUs
@ -31,6 +30,7 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
- perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:512,512]
# Phi-4-multimodal-instruct
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
# Bielik-11B-v2.2-Instruct
@ -124,25 +124,9 @@ llm_perf_sanity:
# for chunked prefill cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200]
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
# disagg server cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
# gpt_oss_20b_fp4
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]


# 7: H20, H100, H200, B200, B300
- condition:
    ranges:
      system_gpu_count:
        gte: 8
      compute_capability:
        gte: 9.0
        lt: 12.0

  tests:
    # chunked attention case
    - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]

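# Note: assembled sketch of one complete test-db entry, using only the schema
# fields visible in the fragments above and below (`condition`/`ranges` gate the
# sibling `stage`, `backend`, and `tests` keys; values here are illustrative):
#
#   - condition:
#       ranges:
#         system_gpu_count:
#           gte: 8
#         compute_capability:
#           gte: 9.0
#           lt: 12.0
#     stage: pre_merge
#     backend: pytorch
#     tests:
#       - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]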
@ -82,6 +82,7 @@ l0_b200:
- unittest/_torch/modeling -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_gpt_oss"
- unittest/_torch/modeling/test_modeling_exaone_moe.py
- unittest/tools/test_layer_wise_benchmarks.py::test_deepseek_r1_ctx_dep[1]
- unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1]
- unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8

@ -43,6 +43,7 @@ l0_dgx_h100:
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-False]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
- unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
- disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
# llmapi

@ -42,5 +42,4 @@ l0_perf:
stage: pre_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-_autodeploy-float16-input_output_len:128,128-reqs:8192]
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-input_output_len:1024,1024-reqs:512]

@ -1,6 +1,5 @@
examples/test_openai.py::test_llm_openai_triton_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
full:GH200/examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (arm is not supported)
full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported)
full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (arm is not supported)
full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported)
@ -13,15 +12,10 @@ full:GH200/examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int
perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2 SKIP (not supported yet)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/4781396)
perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input)
full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] SKIP (skip on L40S commit f9a0fcb0)
full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5014327)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (https://nvbugs/5000026)
@ -31,7 +25,6 @@ examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-full_prec] SKIP
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8] SKIP (https://nvbugs/5000026)
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec] SKIP (https://nvbugs/5000026)
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq] SKIP (https://nvbugs/5000026)
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5000026)
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
@ -47,42 +40,15 @@ full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blac
full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:sm100/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (Disable for Blackwell OOM)
full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5064768)
test_e2e.py::test_openai_consistent_chat SKIP (https://nvbugs/5112075)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (https://nvbugs/5114678)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] SKIP (https://nvbugs/5135328)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5141288)
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5176867)
full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 SKIP (https://nvbugs/5176867)
full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights SKIP (https://nvbugs/5176867)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
@ -106,10 +72,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-rec
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2] SKIP (https://nvbugs/5219535)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
@ -132,15 +94,10 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[deepseek
full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405)
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] SKIP (https://nvbugs/5273695)
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
@ -186,33 +143,19 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714)
test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)
full:L40S/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
full:L20/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] SKIP (https://nvbugs/5380570)
test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-8] SKIP (https://nvbugs/5380570)
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
|
||||
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
|
||||
examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
|
||||
test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288)
|
||||
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067)
|
||||
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068)
|
||||
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070)
|
||||
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5421989)
|
||||
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5421989)
|
||||
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132)
|
||||
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache SKIP (https://nvbugs/5433541)
|
||||
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugs/5433541)
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
|
||||
examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
|
||||
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627)
|
||||
@ -226,14 +169,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http
|
||||
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
|
||||
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
|
||||
triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-4-mini-instruct] SKIP (https://nvbugs/5465143)
|
||||
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
|
||||
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
|
||||
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
|
||||
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
|
||||
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
|
||||
triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392)
|
||||
@ -241,17 +176,6 @@ triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5
|
||||
triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378)
|
||||
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5477421)
|
||||
test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5448462)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448462)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448479)
|
||||
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
|
||||
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
|
||||
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5465143, 5481206 WNF)
|
||||
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5465143, 5481206 WNF)
|
||||
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075)
|
||||
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5738168)
|
||||
test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319)
|
||||
@ -260,10 +184,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
|
||||
examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507)
|
||||
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507)
|
||||
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507)
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507)
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507)
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507)
|
||||
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507)
|
||||
cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689)
|
||||
cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
|
||||
full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
|
||||
@ -278,26 +198,9 @@ full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_f
|
||||
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
|
||||
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
|
||||
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
|
||||
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (https://nvbugs/5451207)
|
||||
accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 SKIP (https://nvbugs/5511944)
|
||||
triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP (https://nvbugs/5470830)
|
||||
full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search SKIP (https://nvbugs/5481075)
|
||||
full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052)
|
||||
full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337)
|
||||
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143)
|
||||
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
|
||||
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
|
||||
full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
|
||||
@ -317,14 +220,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5655832)
|
||||
test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825)
|
||||
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904)
|
||||
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904)
|
||||
@ -335,9 +230,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas
|
||||
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804)
|
||||
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
|
||||
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype SKIP (https://nvbugs/5588376)
|
||||
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
|
||||
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
|
||||
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
|
||||
@ -361,9 +254,6 @@ accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/570
|
||||
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194)
|
||||
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2 SKIP (https://nvbugs/5705195)
|
||||
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] SKIP (https://nvbugs/5666826)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
|
||||
@ -387,35 +277,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_tr
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304)
|
||||
unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392)
|
||||
unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476)
|
||||
examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293)
|
||||
examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293)
|
||||
examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377)
|
||||
test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432)
|
||||
test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920)
|
||||
test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938)
|
||||
triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981)
|
||||
unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[False] SKIP (https://nvbugs/5771838)
|
||||
unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[True] SKIP (https://nvbugs/5771838)
|
||||
accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi2::test_tp2 SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
|
||||
cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
|
||||
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
|
||||
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979)
|
||||
examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
|
||||
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
|
||||
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
|
||||
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5754976)
|
||||
examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (https://nvbugs/5754976)
|
||||
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5568052)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376)
|
||||
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698)
|
||||
triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
|
||||
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737)
|
||||
@ -462,6 +335,7 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (
|
||||
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
|
||||
@ -475,7 +349,6 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
|
||||
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)
|
||||
unittest/_torch/attention/test_flashinfer_star_attn.py::TestStarAttention::test_flashinfer_star_attention[num_layers:2-num_heads:32-num_kv_heads:8-head_dim:64-anchor_size:64-block_size:64-dtype:torch.float16] SKIP (https://nvbugs/5781389)
|
||||
unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:16-hidden:128] SKIP (https://nvbugs/5781383)
|
||||
cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
|
||||
@ -492,11 +365,9 @@ unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_in
|
||||
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)
|
||||
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5785465)
|
||||
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5785485)
|
||||
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5787855)
|
||||
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5787855)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/5787836)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836)
|
||||
@ -515,3 +386,11 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (http
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5800591)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5800646)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] SKIP (https://nvbugs/5800679)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5741304)
|
||||
accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/5800725)
|
||||
examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248)
|
||||
|
||||
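Each waive entry above follows the same plain-text format: a pytest node id (optionally prefixed with a stage/GPU qualifier such as full:B200/), the SKIP keyword, and usually a parenthesized bug link, occasionally a trailing "# <url>" comment instead. A minimal parsing sketch under exactly that assumption; the WaiveEntry name and regex are illustrative, not part of the repository:

import re
from typing import NamedTuple, Optional

# Hypothetical helper: format inferred from the list above.
_WAIVE_RE = re.compile(
    r"^(?P<test>.+?)\s+SKIP(?:\s*\((?P<reason>[^)]*)\))?(?:\s*#\s*(?P<comment>.*))?\s*$"
)


class WaiveEntry(NamedTuple):
    test: str
    reason: Optional[str]


def parse_waive_line(line: str) -> Optional[WaiveEntry]:
    """Parse one waive line; returns None for blanks and pure comments."""
    line = line.strip()
    if not line or line.startswith("#"):
        return None
    match = _WAIVE_RE.match(line)
    if match is None:
        return None
    # Prefer the parenthesized reason, fall back to a trailing comment.
    return WaiveEntry(match.group("test"), match.group("reason") or match.group("comment"))


entry = parse_waive_line(
    "accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)"
)
assert entry == WaiveEntry(
    "accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache",
    "https://nvbugs/5231310",
)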
@ -1,6 +1,6 @@
server_configs:
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -31,7 +31,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -62,7 +62,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -97,7 +97,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -128,7 +128,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -159,7 +159,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -194,7 +194,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -225,7 +225,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -256,7 +256,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 4
    match_mode: scenario
    cuda_graph_config:
@ -291,7 +291,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -322,7 +322,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -353,7 +353,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8
    model_name: nvidia/DeepSeek-R1-0528-FP4-v2
    model_name: deepseek_r1_0528_fp4_v2
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -388,7 +388,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -419,7 +419,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -450,7 +450,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -481,7 +481,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -512,7 +512,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -543,7 +543,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -578,7 +578,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -613,7 +613,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -648,7 +648,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -683,7 +683,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -718,7 +718,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -753,7 +753,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -788,7 +788,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -823,7 +823,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -858,7 +858,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -893,7 +893,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -928,7 +928,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -963,7 +963,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -998,7 +998,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1033,7 +1033,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1068,7 +1068,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1103,7 +1103,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1138,7 +1138,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1173,7 +1173,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1208,7 +1208,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1243,7 +1243,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1313,7 +1313,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1348,7 +1348,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1383,7 +1383,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1418,7 +1418,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1453,7 +1453,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1488,7 +1488,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1523,7 +1523,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1558,7 +1558,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1593,7 +1593,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1628,7 +1628,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1663,7 +1663,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1698,7 +1698,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1733,7 +1733,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1768,7 +1768,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1803,7 +1803,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
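For orientation, entries in these server_configs files can be consumed with plain PyYAML; a minimal sketch under the assumption that the visible schema (name, model_name, gpus, match_mode, ...) is the whole contract. The file name and the select_configs helper are hypothetical, not part of the repository:

import yaml  # PyYAML


def select_configs(path: str, gpus: int, substring: str = "") -> list:
    """Return the server_configs entries matching a GPU count and name substring."""
    with open(path) as f:
        doc = yaml.safe_load(f)
    return [
        cfg
        for cfg in doc.get("server_configs", [])
        if cfg.get("gpus") == gpus and substring in cfg.get("name", "")
    ]


# Example: every 8-GPU DeepSeek-R1 FP8 scenario; model_name carries either the
# old HF-style id or the new canonical key, whichever the file version holds.
for cfg in select_configs("perf_sanity_server_configs.yaml", gpus=8,
                          substring="deepseek_ai_DeepSeek_R1_0528"):
    print(cfg["name"], cfg["model_name"])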
@ -1,6 +1,6 @@
server_configs:
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -31,7 +31,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -62,7 +62,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -93,7 +93,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -124,7 +124,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -155,7 +155,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
    model_name: deepseek-ai/DeepSeek-R1-0528
    model_name: deepseek_r1_0528_fp8
    gpus: 8
    match_mode: scenario
    cuda_graph_config:
@ -190,7 +190,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -224,7 +224,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -258,7 +258,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -292,7 +292,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -326,7 +326,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -360,7 +360,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -394,7 +394,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -428,7 +428,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -462,7 +462,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -496,7 +496,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -530,7 +530,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -564,7 +564,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -598,7 +598,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -632,7 +632,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -666,7 +666,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -700,7 +700,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -734,7 +734,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -768,7 +768,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -802,7 +802,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -836,7 +836,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -870,7 +870,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -904,7 +904,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -938,7 +938,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -972,7 +972,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1006,7 +1006,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1040,7 +1040,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1074,7 +1074,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 1
    match_mode: scenario
    env_overrides:
@ -1108,7 +1108,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1142,7 +1142,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1176,7 +1176,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 2
    match_mode: scenario
    env_overrides:
@ -1210,7 +1210,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1244,7 +1244,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 4
    match_mode: scenario
    env_overrides:
@ -1312,7 +1312,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1346,7 +1346,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -1380,7 +1380,7 @@ server_configs:
    backend: openai
    streaming: true
  - name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
    model_name: openai/gpt-oss-120b
    model_name: gpt_oss_120b_fp4
    gpus: 8
    match_mode: scenario
    env_overrides:
@ -131,7 +131,7 @@ server_configs:
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"

- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -161,7 +161,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"

- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -191,7 +191,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"

# 1k8k configs
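All three hunks above tighten random_range_ratio from 0.8 to 0.2. Assuming the harness samples each request's input/output length uniformly within ±ratio of the nominal isl/osl (an assumption about the benchmark's random dataset generator, not something this diff states), a quick sketch of the effect:

import random

def sampled_len(nominal: int, range_ratio: float) -> int:
    # Assumed semantics: draw uniformly from [nominal*(1-r), nominal*(1+r)].
    low = int(nominal * (1 - range_ratio))
    high = int(nominal * (1 + range_ratio))
    return random.randint(low, high)

# For isl=8192: ratio 0.8 spans roughly 1638..14745 tokens, while ratio 0.2
# narrows that to roughly 6553..9830, keeping requests close to the nominal ISL.
print(sampled_len(8192, 0.2))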
@ -1,11 +1,18 @@
import subprocess
import time

import requests


def wait_for_endpoint_ready(url: str, timeout: int = 300):
def wait_for_endpoint_ready(url: str, timeout: int = 300, server_proc: subprocess.Popen = None):
    start = time.monotonic()
    while time.monotonic() - start < timeout:
        if server_proc is not None:
            exit_code = server_proc.poll()
            if exit_code is not None:
                raise RuntimeError(
                    f"Server process exited with code {exit_code} before becoming ready."
                )
        try:
            time.sleep(1)
            if requests.get(url, timeout=5).status_code == 200:
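For context, a minimal usage sketch of the hardened helper; the serve command, port, and health URL below are illustrative assumptions, and only wait_for_endpoint_ready itself comes from the hunk above:

import subprocess

# Hypothetical launch; the real command lives elsewhere in the test harness.
proc = subprocess.Popen(["trtllm-serve", "some-model", "--port", "8000"])
try:
    # Passing server_proc makes a crashed server raise RuntimeError immediately
    # instead of silently polling until the 300 s timeout expires.
    wait_for_endpoint_ready("http://localhost:8000/health", server_proc=proc)
finally:
    proc.terminate()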
406 tests/unittest/_torch/modeling/test_modeling_exaone_moe.py Normal file
@ -0,0 +1,406 @@
import unittest
from copy import deepcopy
from dataclasses import dataclass

import torch
from _torch.helpers import create_mock_cuda_graph_runner
from parameterized import parameterized

import tensorrt_llm
from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
from tensorrt_llm._torch.metadata import KVCacheParams
from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.models.modeling_exaone_moe import ExaoneMoeForCausalLM
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm.bindings.executor import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig

from utils.util import getSMVersion  # isort: skip

# fmt: off
# TODO: Remove this once we have a proper transformers package
from tensorrt_llm._torch.models.modeling_exaone_moe import ExaoneMoEConfig  # isort: skip

SKIP_EXAONE_MOE_HF_ACCURACY_TEST = False
try:
    from transformers.models.exaone_moe.modeling_exaone_moe import (
        ExaoneMoEForCausalLM as HFExaoneMoEForCausalLM,
    )
except ImportError:
    # TODO: Remove this once we have a proper config for EXAONE-MoE
    SKIP_EXAONE_MOE_HF_ACCURACY_TEST = True
# fmt: on

WINDOW_SIZE = 4
NUM_HIDDEN_LAYERS = 4

EXAONE_MOE_CONFIG = {
    "architectures": ["ExaoneMoEForCausalLM"],
    "attention_dropout": 0.0,
    "bos_token_id": 1,
    "dtype": "bfloat16",
    "eos_token_id": 53,
    "first_last_k_dense_replace": 1,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 6144,
    "initializer_range": 0.02,
    "intermediate_size": 18432,
    "is_moe_layer": [False] + [True] * (NUM_HIDDEN_LAYERS - 1),
    "layer_types": [
        "sliding_attention",
        "sliding_attention",
        "sliding_attention",
        "full_attention",
    ],
    "max_position_embeddings": 262144,
    "model_type": "exaone_moe",
    "moe_intermediate_size": 2048,
    "n_group": 1,
    "norm_topk_prob": True,
    "num_attention_heads": 64,
    "num_experts": 128,
    "num_experts_per_tok": 8,
    "num_hidden_layers": NUM_HIDDEN_LAYERS,
    "num_key_value_heads": 8,
    "num_shared_experts": 1,
    "pad_token_id": 0,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 1000000,
    "routed_scaling_factor": 2.5,
    "scoring_func": "sigmoid",
    "sliding_window": WINDOW_SIZE,
    "sliding_window_pattern": "LLLG",
    "tie_word_embeddings": False,
    "tokenizer_class": "GPT2Tokenizer",
    "topk_group": 1,
    "topk_method": "noaux_tc",
    "transformers_version": "5.0.0.dev0",
    "use_cache": True,
    "vocab_size": 153600,
}


@dataclass(repr=False)
class Scenario:
    attention_backend: str
    input_len: int = WINDOW_SIZE - 1
    use_cuda_graph: bool = False

    def __repr__(self) -> str:
        return (
            f"attention_backend:{self.attention_backend.lower()}-"
            f"input_len:{self.input_len}-"
            f"use_cuda_graph:{self.use_cuda_graph}"
        )


class TestExaoneMoe(unittest.TestCase):
    @parameterized.expand([None, "FP8"])
    def test_exaone_moe_sanity(self, quant_algo):
        """Test basic EXAONE-MoE model forward pass with optional quantization."""

        config_dict = deepcopy(EXAONE_MOE_CONFIG)
        exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)

        if quant_algo:
            quant_config = QuantConfig(quant_algo=quant_algo)
        else:
            quant_config = QuantConfig()

        if quant_algo == "FP8" and getSMVersion() < 89:
            self.skipTest("This test is not supported in pre-Ada architecture")

        dtype = exaone_moe_config.torch_dtype
        device = torch.device("cuda")

        model_config = ModelConfig(pretrained_config=exaone_moe_config, quant_config=quant_config)
        exaone_moe = ExaoneMoeForCausalLM(model_config).to(device)

        input_ids = torch.tensor(
            [100, 200, 300, 100, 200, 100, 400, 500], dtype=torch.int, device=device
        )

        context_sequence_lengths = [3, 2, 1]
        sequence_lengths = context_sequence_lengths + [1, 1]
        past_seen_tokens = [0, 0, 0, 62, 75]
        request_ids = list(range(len(sequence_lengths)))
        token_nums = (torch.tensor(past_seen_tokens) + torch.tensor(sequence_lengths)).tolist()
        prompt_lens = token_nums[:3] + past_seen_tokens[3:]

        num_blocks = 100
        tokens_per_block = 128
        head_dim = exaone_moe.config.hidden_size // exaone_moe.config.num_attention_heads
        num_layers = exaone_moe.config.num_hidden_layers
        num_kv_heads = exaone_moe.config.num_key_value_heads
        max_seq_len = num_blocks * tokens_per_block
        batch_size = len(context_sequence_lengths) + 2

        if dtype == torch.half:
            kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
        elif dtype == torch.bfloat16:
            kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
        else:
            raise ValueError("Invalid dtype")

        mapping = Mapping(world_size=1, tp_size=1, rank=0)
        kv_cache_config = KvCacheConfig(max_tokens=num_blocks * tokens_per_block)
        kv_cache_manager = KVCacheManager(
            kv_cache_config,
            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
            num_layers=num_layers,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            tokens_per_block=tokens_per_block,
            max_seq_len=max_seq_len,
            max_batch_size=batch_size,
            mapping=mapping,
            dtype=kv_cache_dtype,
        )
        kv_cache_manager.add_dummy_requests(request_ids, token_nums)

        metadata_cls = get_attention_backend(model_config.attn_backend).Metadata
        attn_metadata = metadata_cls(
            seq_lens=torch.tensor(sequence_lengths, dtype=torch.int),
            num_contexts=len(context_sequence_lengths),
            kv_cache_params=KVCacheParams(
                use_cache=True,
                num_cached_tokens_per_seq=past_seen_tokens,
            ),
            kv_cache_manager=kv_cache_manager,
            request_ids=request_ids,
            prompt_lens=prompt_lens,
            max_num_requests=len(context_sequence_lengths) + 2,
            max_num_tokens=8192,
        )

        position_ids = []
        for i, tokens in enumerate(past_seen_tokens):
            seq_len = context_sequence_lengths[i] if i < len(context_sequence_lengths) else 1
            position_id = torch.arange(tokens, tokens + seq_len, device=input_ids.device)
            position_ids.append(position_id)

        position_ids = torch.cat(position_ids).unsqueeze(0)

        with torch.inference_mode():
            attn_metadata.prepare()
            logits = exaone_moe.forward(
                input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
            )

        self.assertEqual(len(past_seen_tokens), logits.shape[0])

        with torch.inference_mode():
            attn_metadata.prepare()
            logits = exaone_moe.forward(
                input_ids=input_ids,
                position_ids=position_ids,
                attn_metadata=attn_metadata,
                return_context_logits=True,
            )
        self.assertEqual(input_ids.shape, logits.shape[:-1])

        kv_cache_manager.shutdown()

    def test_exaone_moe_moe_layer_config(self):
        """Test that MoE layers are correctly configured."""
        config_dict = deepcopy(EXAONE_MOE_CONFIG)
        exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)

        device = torch.device("cuda")
        model_config = ModelConfig(pretrained_config=exaone_moe_config)
        exaone_moe = ExaoneMoeForCausalLM(model_config).to(device)

        # Verify MoE layer configuration
        is_moe_layer = config_dict["is_moe_layer"]
        self.assertEqual(len(is_moe_layer), NUM_HIDDEN_LAYERS)
        self.assertFalse(is_moe_layer[0])  # First layer should be dense
        for i in range(1, NUM_HIDDEN_LAYERS):
            self.assertTrue(is_moe_layer[i])  # Rest should be MoE

        # Verify model has correct number of layers
        self.assertEqual(len(exaone_moe.model.layers), NUM_HIDDEN_LAYERS)

    @parameterized.expand(
        [
            Scenario(attention_backend="TRTLLM", input_len=WINDOW_SIZE - 2),
            Scenario(attention_backend="TRTLLM", input_len=WINDOW_SIZE - 2, use_cuda_graph=True),
        ],
        lambda testcase_func, param_num, param: f"{testcase_func.__name__}[{param.args[0]}]",
    )
    @torch.no_grad()
    def test_exaone_moe_allclose_to_hf(self, scenario: Scenario) -> None:
        """Compare output to HuggingFace implementation."""
        if SKIP_EXAONE_MOE_HF_ACCURACY_TEST:
            self.skipTest("EXAONE-MoE HF model is not available in this environment")

        attention_backend = scenario.attention_backend
        metadata_cls = get_attention_backend(attention_backend).Metadata

        torch.random.manual_seed(0)
        config_dict = deepcopy(EXAONE_MOE_CONFIG)
        exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)
        dtype = exaone_moe_config.torch_dtype
        device = torch.device("cuda")

        hf_exaone_moe = HFExaoneMoEForCausalLM(exaone_moe_config).to(dtype).to(device).eval()

        model_config = ModelConfig(
            pretrained_config=exaone_moe_config, attn_backend=attention_backend
        )
        exaone_moe = ExaoneMoeForCausalLM(model_config).to(dtype).to(device)
        exaone_moe.load_weights(hf_exaone_moe.state_dict())
        exaone_moe.post_load_weights()

        num_blocks = 1
        tokens_per_block = 128
        head_dim = getattr(
            exaone_moe.config,
            "head_dim",
            exaone_moe.config.hidden_size // exaone_moe.config.num_attention_heads,
        )
        num_layers = exaone_moe.config.num_hidden_layers
        num_kv_heads = exaone_moe.config.num_key_value_heads
        max_seq_len = num_blocks * tokens_per_block
        batch_size = 1

        if dtype == torch.half:
            kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
        elif dtype == torch.bfloat16:
            kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
        else:
            raise ValueError("Invalid dtype")

        mapping = Mapping(world_size=1, tp_size=1, rank=0)
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=False,
            enable_partial_reuse=False,
            copy_on_partial_reuse=False,
            max_attention_window=[int(exaone_moe_config.sliding_window)],
            max_tokens=num_blocks * tokens_per_block,
        )
        kv_cache_manager = KVCacheManager(
            kv_cache_config,
            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
            num_layers=num_layers,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            tokens_per_block=tokens_per_block,
            max_seq_len=max_seq_len,
            max_batch_size=batch_size,
            mapping=mapping,
            dtype=kv_cache_dtype,
        )

        # Context phase
        input_ids = torch.tensor(
            [i * 100 for i in range(1, scenario.input_len + 1)], dtype=torch.int32, device=device
        )

        num_cached_tokens_per_seq = [0]
        request_ids = [1]
        token_nums = [input_ids.size(-1)]
        prompt_lens = [input_ids.size(-1)]
        kv_cache_manager.add_dummy_requests(request_ids, token_nums)

        attn_metadata = metadata_cls(
            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
            num_contexts=1,
            kv_cache_params=KVCacheParams(
                use_cache=True,
                num_cached_tokens_per_seq=num_cached_tokens_per_seq,
            ),
            max_num_requests=1,
            max_num_tokens=8192,
            kv_cache_manager=kv_cache_manager,
            request_ids=request_ids,
            prompt_lens=prompt_lens,
        )

        position_ids = [torch.arange(0, input_ids.size(-1), dtype=torch.int32)]
        position_ids = torch.cat(position_ids).unsqueeze(0).cuda()

        with torch.inference_mode():
            attn_metadata.prepare()
            logits = exaone_moe.forward(
                input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
            )
            ref = hf_exaone_moe.forward(
                input_ids=input_ids.unsqueeze(0), position_ids=position_ids, use_cache=True
            )

        # MoE models may have slightly higher tolerance due to expert routing
        torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.5, rtol=0.5)

        # Generation phase
        gen_input_ids = torch.tensor([600], dtype=torch.int32, device=device)
        num_cached_tokens_per_seq = [input_ids.size(-1)]

        attn_metadata = metadata_cls(
            seq_lens=torch.tensor([gen_input_ids.size(-1)], dtype=torch.int),
            num_contexts=0,
            kv_cache_params=KVCacheParams(
                use_cache=True,
                num_cached_tokens_per_seq=num_cached_tokens_per_seq,
            ),
            max_num_requests=1,
            max_num_tokens=8192,
            kv_cache_manager=kv_cache_manager,
            request_ids=request_ids,
            prompt_lens=prompt_lens,
        )

        gen_position_ids = [
            torch.arange(
                input_ids.size(-1), input_ids.size(-1) + gen_input_ids.size(-1), dtype=torch.int32
            )
        ]
        gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()

        graph_runner = create_mock_cuda_graph_runner(1) if scenario.use_cuda_graph else None

        def run_forward(input_ids, position_ids, attn_metadata):
            attn_metadata.prepare()
            if not scenario.use_cuda_graph:
                return exaone_moe.forward(
                    input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
                )
            else:
                inputs = {
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attn_metadata": attn_metadata,
                }
                key = (1, 0, False)
                graph_runner.capture(key, lambda inputs: exaone_moe.forward(**inputs), inputs)

                for _ in range(2):
                    attn_metadata.prepare()
                    logits = graph_runner.replay(key, inputs)
                return logits

        if scenario.use_cuda_graph:
            attn_metadata = attn_metadata.create_cuda_graph_metadata(1)

        with torch.inference_mode():
            logits = run_forward(
                input_ids=gen_input_ids, position_ids=gen_position_ids, attn_metadata=attn_metadata
            )
            ref = hf_exaone_moe.forward(
                input_ids=gen_input_ids.unsqueeze(0),
                position_ids=gen_position_ids,
                past_key_values=ref.past_key_values,
                use_cache=True,
            )

        torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.5, rtol=0.5)

        if graph_runner is not None:
            graph_runner.clear()
        kv_cache_manager.shutdown()


if __name__ == "__main__":
    unittest.main()
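A small runner sketch for the new suite; the import path assumes the file's own directory is on sys.path, and a CUDA device plus the EXAONE-MoE transformers port are needed for the full set:

import unittest

# Hypothetical direct import; in CI the file is collected by the test runner instead.
from test_modeling_exaone_moe import TestExaoneMoe

suite = unittest.TestLoader().loadTestsFromTestCase(TestExaoneMoe)
unittest.TextTestRunner(verbosity=2).run(suite)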
@ -1,12 +1,16 @@
import json
import os
import time
from itertools import product
from pathlib import Path
from typing import Generator

import pytest
import torch
from utils.llm_data import llm_models_root

from tensorrt_llm import MultimodalEncoder
from tensorrt_llm._torch.shared_tensor import SharedTensorContainer
from tensorrt_llm.inputs import default_multimodal_input_loader
from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import LLM, SamplingParams
@ -24,56 +28,127 @@ _QWEN_2_5_VL_DIR = llm_models_root() / "Qwen2.5-VL-3B-Instruct"
_QWEN_3_VL_DIR = llm_models_root() / "Qwen3" / "Qwen3-VL-2B-Instruct"


# TODO: Add multi-image in single chat test
@pytest.mark.parametrize("model_dir",
                         [_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR])
@pytest.mark.parametrize("pd_disagg", [False, True])
def test_single_image_chat(model_dir, pd_disagg):
    """Test processing single image using encoder (pass mm_embeddings) + LLM API.
@pytest.mark.parametrize(
    "prompts,expected_num_duplicates",
    [
        # Full reuse: same media + same prompts
        # All blocks are reused, thus no duplicates
        (["Describe the natural environment in the image."] * 2, 0),
        # Partial reuse: same media + different prompts
        # Prefix blocks are reused, thus 2 duplicates
        ([
            "Describe the natural environment in the image.",
            "What objects can you see in the image?",
            "Describe the weather in the image.",
        ], 2),
    ])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
    """Test mm_keys in KV cache events with cache reuse scenarios.

    This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
    results to standard llm generation (pass raw image) by comparing outputs.
    This test verifies:
    1. KV cache events contain mm_keys for multimodal blocks
    2. mm_keys have the expected structure (hash + start_offset)
    3. Cache reuse behavior based on media and prompts:
       - Same media + same prompts: full reuse (0 duplicate offsets)
       - Same media + different prompts: partial reuse (prefix blocks reused)
    """
    encoder_model_dir = _LLAVA_DIR

    # Test configuration
    max_tokens = 64
    max_tokens = 16
    free_gpu_memory_fraction = 0.2
    max_batch_size = 1

    # Test data - OpenAI chat completion format
    prompts = ["Describe the natural environment in the image."]
    media = [example_images[0]]
    # Use same image for all prompts
    media = [example_images[0]] * len(prompts)

    # Sampling configuration
    sampling_params = SamplingParams(max_tokens=max_tokens)
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=False,
        enable_block_reuse=True,
        free_gpu_memory_fraction=free_gpu_memory_fraction,
        event_buffer_max_size=1024,  # Enable KV cache events
    )

    llm = LLM(model=encoder_model_dir,
              backend='pytorch',
              kv_cache_config=kv_cache_config,
              max_batch_size=1)

    inputs = _load_inputs(llm, prompts, media)

    with llm:
        # Generate for each input separately to test KV cache reuse
        for inp in inputs:
            _ = llm.generate([inp], sampling_params=sampling_params)

        time.sleep(0.5)  # Wait for events to be dispatched
        events = llm.get_kv_cache_events(10)

    # Extract mm_keys offsets from stored events
    mm_keys_offsets = []
    for event in events:
        if event and event.get("data", {}).get("type") == "stored":
            for block in event["data"].get("blocks", []):
                if block.get("mm_keys"):
                    for mm_key in block["mm_keys"]:
                        assert "hash" in mm_key, "mm_key should have 'hash' field"
                        assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
                        mm_keys_offsets.append(mm_key["start_offset"])

    num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
    assert num_duplicates == expected_num_duplicates, (
        f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
        f"got {num_duplicates}. Offsets: {mm_keys_offsets}")


@pytest.fixture(scope="module",
                params=[_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR],
                ids=["llava_7b", "qwen2.5_3b", "qwen3_2b"])
def model_dir(request) -> Path:
    return request.param


@pytest.fixture(scope="module", params=[False, True])
def pd_disagg(request) -> bool:
    return request.param


@pytest.fixture(scope="module")
def llms(model_dir: Path,
         pd_disagg: bool) -> Generator[tuple[LLM, LLM | None], None, None]:
    """Get LLM for prefill and, if disagg, separate LLM for decode."""
    free_gpu_memory_fraction = 0.2
    disable_overlap_scheduler = pd_disagg
    cache_transceiver_cfg = CacheTransceiverConfig(
        backend="DEFAULT") if pd_disagg else None
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=False,  # Disable for output 1:1 matching check
        free_gpu_memory_fraction=free_gpu_memory_fraction,
    )

    # Process multimodal data using encoder (pass mm_embeddings)
    encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
    llm = LLM(
        model=model_dir,
        backend='pytorch',
        kv_cache_config=kv_cache_config,
        trust_remote_code=True,
        cache_transceiver_config=cache_transceiver_cfg,
        disable_overlap_scheduler=disable_overlap_scheduler,
        max_batch_size=1,  # fix batch size to reduce non-determinism in tests
    )
    with llm:
        if pd_disagg:
            llm_decode = LLM(
                model=model_dir,
                backend='pytorch',
                kv_cache_config=kv_cache_config,
                trust_remote_code=True,
                cache_transceiver_config=cache_transceiver_cfg,
            )
            with llm_decode:
                yield (llm, llm_decode)
        else:
            yield (llm, None)

    cache_transceiver_cfg = CacheTransceiverConfig(
        backend="DEFAULT") if pd_disagg else None

    disable_overlap_scheduler = pd_disagg

    llm = LLM(model=model_dir,
              backend='pytorch',
              kv_cache_config=kv_cache_config,
              trust_remote_code=True,
              cache_transceiver_config=cache_transceiver_cfg,
              disable_overlap_scheduler=disable_overlap_scheduler)

    llm_decode = None
    if pd_disagg:
        llm_decode = LLM(model=model_dir,
                         backend='pytorch',
                         kv_cache_config=kv_cache_config,
                         trust_remote_code=True,
                         cache_transceiver_config=cache_transceiver_cfg)


def _load_inputs(llm: LLM, prompts, media, mm_embeddings=None):
    # Load model configuration
    config_path = os.path.join(llm._hf_model_dir, 'config.json')
    assert os.path.exists(
@ -90,11 +165,42 @@ def test_single_image_chat(model_dir, pd_disagg):
        modality="image",
        prompts=prompts,
        media=media,
        mm_embeddings=mm_embeddings,
        image_data_format="pt")

    # Validate inputs structure
    assert len(inputs) == len(
        prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
    return inputs


# TODO: Add multi-image in single chat test
@pytest.mark.threadleak(enabled=False)
def test_single_image_chat(
    pd_disagg: bool,
    model_dir: Path,
    llms: tuple[LLM, LLM | None],
):
    """Test processing single image using encoder (pass mm_embeddings) + LLM API.

    This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
    results to standard llm generation (pass raw image) by comparing outputs.
    """
    llm, llm_decode = llms

    # Test configuration
    max_tokens = 64
    max_batch_size = 1

    # Test data - OpenAI chat completion format
    prompts = ["Describe the natural environment in the image."]
    media = [example_images[0]]

    # Sampling configuration
    sampling_params = SamplingParams(max_tokens=max_tokens)

    # Prepare multimodal inputs
    inputs = _load_inputs(llm, prompts, media)

    # Generate reference output with raw multimodal inputs
    outputs_ref = llm.generate(inputs, sampling_params=sampling_params)

@ -109,33 +215,35 @@ def test_single_image_chat(model_dir, pd_disagg):
    ) > 0, f"Reference generation has no output text for input {i}"

    # Prepare inputs for llm (pass mm_embeddings)
    encoder_outputs = encoder.generate(inputs)
    # Process multimodal data using encoder (pass mm_embeddings)
    encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
    with encoder:
        encoder_outputs = encoder.generate(inputs)

    # Generate output using llm (pass mm_embeddings)
    ep_disaggregated_params = encoder_outputs[0].disaggregated_params
        # Generate output using llm (pass mm_embeddings)
        ep_disaggregated_params = encoder_outputs[0].disaggregated_params

    assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
    ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
        assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
        ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
        outputs = llm.generate(inputs,
                               sampling_params=sampling_params,
                               disaggregated_params=ep_disaggregated_params)

    outputs = llm.generate(inputs,
                           sampling_params=sampling_params,
                           disaggregated_params=ep_disaggregated_params)
        if pd_disagg:
            # Generation using llm_decode
            assert len(outputs) == 1
            pd_disaggregated_params = outputs[0].disaggregated_params
            pd_disaggregated_params.request_type = "generation_only"
            sampling_params = SamplingParams(max_tokens=max_tokens)
            # remove multimodal data from input as decoder worker doesn't need it
            inputs[0]['multi_modal_data'] = None
            # use prompt token ids from encoder output
            inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids

    if pd_disagg:
        # Generation using llm_decode
        assert len(outputs) == 1
        pd_disaggregated_params = outputs[0].disaggregated_params
        pd_disaggregated_params.request_type = "generation_only"
        sampling_params = SamplingParams(max_tokens=max_tokens)
        # remove multimodal data from input as decoder worker doesn't need it
        inputs[0]['multi_modal_data'] = None
        # use prompt token ids from encoder output
        inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids

            outputs = llm_decode.generate(
                inputs,
                sampling_params=sampling_params,
                disaggregated_params=pd_disaggregated_params)
        outputs = llm_decode.generate(
            inputs,
            sampling_params=sampling_params,
            disaggregated_params=pd_disaggregated_params)

    # Validate outputs
    assert len(outputs) == len(
@ -175,24 +283,37 @@ def test_single_image_chat(model_dir, pd_disagg):
            f"Log probabilities don't match for output {i}, generation {j}"


@pytest.mark.parametrize(
    "model_dir, encoder_max_batch_size",
    [
        (_LLAVA_DIR, 3),
        # Qwen2.5 VL's vision encoder seems to output different embeddings based on this value.
        # The test only passes with this set to 1.
        (_QWEN_2_5_VL_DIR, 1),
        (_QWEN_3_VL_DIR, 3),
    ],
)
def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
@pytest.mark.parametrize("use_mm_embeddings,pass_embeddings_through_loader",
                         product([False, True], [False, True]))
@pytest.mark.threadleak(enabled=False)
def test_multi_request_batch_chat(
    model_dir: Path,
    llms: tuple[LLM, LLM | None],
    use_mm_embeddings: bool,
    pass_embeddings_through_loader: bool,
):
    """Test batching multiple multimodal requests and verify encoder path matches raw path.

    This mirrors test_single_image_chat but with a batch of size 3.
    This mirrors test_single_image_chat but with a batch of size 3. It also tests passing
    embeddings alongside the prompt ("multi_modal_embeddings"), as well as the embedding
    handling within default_multimodal_input_loader.
    """
    if use_mm_embeddings and model_dir in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR]:
        pytest.skip("Qwen does not implement attach_multimodal_embeddings")

    # Qwen2.5/3 VL's vision encoder seems to output different embeddings based on this value.
    # The test only passes with this set to 1.
    encoder_max_batch_size = (1 if model_dir
                              in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR] else 3)

    llm, llm_decode = llms
    if llm_decode is not None:
        pytest.skip("Disagg support not implemented in test case")

    if pass_embeddings_through_loader and not use_mm_embeddings:
        pytest.skip("Redundant test configuration")

    max_tokens = 64
    free_gpu_memory_fraction = 0.6

    prompts = [
        "Describe the natural environment in the image.",
@ -202,37 +323,8 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
    media = [example_images[0], example_images[1], example_images[2]]

    sampling_params = SamplingParams(max_tokens=max_tokens)
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=
        False,  # Disable block reuse for output 1-1 matching check
        free_gpu_memory_fraction=free_gpu_memory_fraction,
    )

    encoder = MultimodalEncoder(model=model_dir,
                                max_batch_size=encoder_max_batch_size)
    llm = LLM(
        model=model_dir,
        backend='pytorch',
        kv_cache_config=kv_cache_config,
        max_batch_size=1,  # fix batch size to reduce non-determinism in tests
        trust_remote_code=True)

    config_path = os.path.join(llm._hf_model_dir, 'config.json')
    assert os.path.exists(
        config_path), f"Model config not found at {config_path}"
    with open(config_path, 'r') as f:
        model_config = json.load(f)
    model_type = model_config['model_type']

    inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
                                             model_dir=llm._hf_model_dir,
                                             model_type=model_type,
                                             modality="image",
                                             prompts=prompts,
                                             media=media,
                                             image_data_format="pt")
    assert len(inputs) == len(
        prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
    inputs = _load_inputs(llm, prompts, media)

    # Reference with raw inputs
    outputs_ref = llm.generate(inputs, sampling_params=sampling_params)
@ -242,107 +334,74 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
        output.outputs
    ) > 0, f"Reference generation has no output text for input {i}"

    # Encoder path
    encoder_outputs = encoder.generate(inputs)
    for eo in encoder_outputs:
        eo.disaggregated_params.request_type = "context_and_generation"
    outputs = llm.generate(inputs,
                           sampling_params=sampling_params,
                           disaggregated_params=[
                               eo.disaggregated_params for eo in encoder_outputs
                           ])
    encoder = MultimodalEncoder(model=model_dir,
                                max_batch_size=encoder_max_batch_size)
    with encoder:
        # Encoder path
        encoder_outputs = encoder.generate(inputs)
    if use_mm_embeddings:
        for input, encoder_output in zip(inputs, encoder_outputs):
            mm_embed_handle = encoder_output.mm_embedding_handle
            assert mm_embed_handle is not None
            mm_embed = SharedTensorContainer.from_dict(
                mm_embed_handle).get_local_view()
            input["multi_modal_embeddings"] = {"image": mm_embed}

    assert len(outputs) == len(prompts)
    for i, output in enumerate(outputs):
        assert len(
            output.outputs) > 0, f"generation has no output text for input {i}"
        if pass_embeddings_through_loader:
            # Test embedding support in default_multimodal_input_loader
            inputs_with_embeddings = _load_inputs(
                llm,
                prompts,
                media=None,
                mm_embeddings=[
                    input["multi_modal_embeddings"]["image"]
                    for input in inputs
                ],
            )
            for input, input_with_embedding in zip(inputs,
                                                   inputs_with_embeddings):
                assert isinstance(input, dict)
                assert isinstance(input_with_embedding, dict)
                assert list(
                    set(input.keys())
                    ^ set(input_with_embedding.keys())) == [
                        "multi_modal_data"
                    ]
                assert set(input_with_embedding.keys()) == set(
                    ["prompt", "multi_modal_embeddings"])
                assert input["prompt"] == input_with_embedding["prompt"]
                assert list(
                    input["multi_modal_embeddings"].keys()) == ["image"]
                assert list(input_with_embedding["multi_modal_embeddings"].
                            keys()) == ["image"]
                mm_embed, = input_with_embedding["multi_modal_embeddings"][
                    "image"]
                torch.testing.assert_close(
                    mm_embed, input["multi_modal_embeddings"]["image"])
            inputs = inputs_with_embeddings  # perform inference with embeddings returned by input loader

    # Compare
    for i, (ref_output, test_output) in enumerate(zip(outputs_ref, outputs)):
        assert len(ref_output.outputs) == len(test_output.outputs), \
            f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
        for j, (ref_gen, test_gen) in enumerate(
                zip(ref_output.outputs, test_output.outputs)):
            assert ref_gen.text == test_gen.text, \
                f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"
        extra_kwargs = {}
    else:
        for eo in encoder_outputs:
            eo.disaggregated_params.request_type = "context_and_generation"
        extra_kwargs = dict(disaggregated_params=[
            eo.disaggregated_params for eo in encoder_outputs
        ])
    outputs = llm.generate(inputs,
                           sampling_params=sampling_params,
                           **extra_kwargs)

    assert len(outputs) == len(prompts)
    for i, output in enumerate(outputs):
        assert len(output.outputs
                   ) > 0, f"generation has no output text for input {i}"

@pytest.mark.parametrize(
    "prompts,expected_num_duplicates",
    [
        # Full reuse: same media + same prompts
        # All blocks are reused, thus no duplicates
        (["Describe the natural environment in the image."] * 2, 0),
        # Partial reuse: same media + different prompts
        # Prefix blocks are reused, thus 2 duplicates
        ([
            "Describe the natural environment in the image.",
            "What objects can you see in the image?",
            "Describe the weather in the image.",
        ], 2),
    ])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
    """Test mm_keys in KV cache events with cache reuse scenarios.

    This test verifies:
    1. KV cache events contain mm_keys for multimodal blocks
    2. mm_keys have the expected structure (hash + start_offset)
    3. Cache reuse behavior based on media and prompts:
       - Same media + same prompts: full reuse (0 duplicate offsets)
       - Same media + different prompts: partial reuse (prefix blocks reused)
    """
    encoder_model_dir = _LLAVA_DIR

    max_tokens = 16
    free_gpu_memory_fraction = 0.6

    # Use same image for all prompts
    media = [example_images[0]] * len(prompts)

    sampling_params = SamplingParams(max_tokens=max_tokens)
    kv_cache_config = KvCacheConfig(
        enable_block_reuse=True,
        free_gpu_memory_fraction=free_gpu_memory_fraction,
        event_buffer_max_size=1024,  # Enable KV cache events
    )

    llm = LLM(model=encoder_model_dir,
              backend='pytorch',
              kv_cache_config=kv_cache_config,
              max_batch_size=1)

    config_path = os.path.join(llm._hf_model_dir, 'config.json')
    with open(config_path, 'r') as f:
        model_config = json.load(f)
    model_type = model_config['model_type']

    inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
                                             model_dir=llm._hf_model_dir,
                                             model_type=model_type,
                                             modality="image",
                                             prompts=prompts,
                                             media=media,
                                             image_data_format="pt")

    # Generate for each input separately to test KV cache reuse
    for inp in inputs:
        _ = llm.generate([inp], sampling_params=sampling_params)

    time.sleep(0.5)  # Wait for events to be dispatched
    events = llm.get_kv_cache_events(10)

    # Extract mm_keys offsets from stored events
    mm_keys_offsets = []
    for event in events:
        if event and event.get("data", {}).get("type") == "stored":
            for block in event["data"].get("blocks", []):
                if block.get("mm_keys"):
                    for mm_key in block["mm_keys"]:
                        assert "hash" in mm_key, "mm_key should have 'hash' field"
                        assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
                        mm_keys_offsets.append(mm_key["start_offset"])

    num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
    assert num_duplicates == expected_num_duplicates, (
        f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
        f"got {num_duplicates}. Offsets: {mm_keys_offsets}")
    # Compare
    for i, (ref_output, test_output) in enumerate(zip(outputs_ref,
                                                      outputs)):
        assert len(ref_output.outputs) == len(test_output.outputs), \
            f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
        for j, (ref_gen, test_gen) in enumerate(
                zip(ref_output.outputs, test_output.outputs)):
            assert ref_gen.text == test_gen.text, \
                f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"
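For orientation, a hand-written illustration of the "stored" event shape the assertions above traverse; field values are invented, and only the keys the test actually checks (data.type, blocks, mm_keys, hash, start_offset) are meaningful:

example_event = {
    "event_id": 0,
    "data": {
        "type": "stored",
        "blocks": [
            {
                # Each multimodal block carries mm_keys entries pairing a content
                # hash with the token offset where that media begins.
                "mm_keys": [{"hash": 123456789, "start_offset": 0}],
            },
        ],
    },
}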