Merge branch 'main' into user/yuhangh/support_export_data_in_eval

heyuhhh 2026-01-13 10:01:12 +08:00 committed by GitHub
commit 8d858f912e
93 changed files with 4272 additions and 2081 deletions

View File

@ -648,7 +648,7 @@ public:
void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false);
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
@ -853,8 +853,8 @@ public:
//! \param blockKeys Key of each block.
//! \param blockIds Id of each block.
//! \param pinBlocks If true, increment ref count for blocks while storing (pin on store).
//! \return Pair of (num blocks stored for reuse, id of the last block stored if any).
[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
//! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
bool pinBlocks = false);
@ -886,8 +886,8 @@ public:
[[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);
//! \brief Unpin blocks by starting from a block id and walking prev pointers.
void unpinBlocksById(KVCacheBlock::IdType blockId);
//! \brief Unpin blocks by block ids directly
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
{
@ -1103,7 +1103,7 @@ public:
std::optional<KVCacheBlock::IdType> releaseBlocks(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
@ -1112,7 +1112,7 @@ public:
/// @param sequence The generation request whose blocks should be pinned.
void pinBlocks(GenerationRequest& sequence);
void unpinBlocksById(KVCacheBlock::IdType blockId);
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);
@ -1133,7 +1133,7 @@ public:
void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
SizeType32 windowSize, bool pinBlocks = false)
{
@ -1584,7 +1584,7 @@ public:
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
/// \brief Store blocks for reuse for a given request id
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] virtual std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false)
= 0;
@ -1678,7 +1678,7 @@ public:
BlockKey const& blockKey, SizeType32 windowSize)
= 0;
virtual void unpinBlocksById(KVCacheBlock::IdType blockId) = 0;
virtual void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) = 0;
};
class KVCacheManager : public BaseKVCacheManager
@ -1939,7 +1939,7 @@ public:
//! \brief Store newest blocks for reuse
void storeNewBlock(LlmRequest const& llmRequest) override;
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false) override;
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
@ -1960,7 +1960,7 @@ public:
void pinBlocks(LlmRequest::RequestIdType requestId) override;
void unpinBlocksById(KVCacheBlock::IdType blockId) override;
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) override;
std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;
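Across these header changes, the pin bookkeeping moves from a single last-block id to the full set of pinned block ids: storeBlocksForReuse and storeBlocks now return every id whose ref count they bumped, and unpinBlocksById takes that vector back. A minimal, self-contained sketch of the contract for illustration only (plain int32_t ids stand in for KVCacheBlock::IdType; ToyBlockManager is not code from this commit):

#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

// Toy model of the new pin/unpin contract: storeBlocksForReuse(pinBlocks=true) returns
// every block id it pinned, and unpinBlocksById later drops exactly one reference per id.
struct ToyBlockManager
{
    std::unordered_map<int32_t, int> refCount;

    std::vector<int32_t> storeBlocksForReuse(std::vector<int32_t> const& blockIds, bool pinBlocks)
    {
        std::vector<int32_t> pinned;
        for (int32_t id : blockIds)
        {
            if (pinBlocks)
            {
                ++refCount[id];
                pinned.push_back(id); // previously: only the last stored id was reported
            }
        }
        return pinned;
    }

    void unpinBlocksById(std::vector<int32_t> const& blockIds)
    {
        for (int32_t id : blockIds)
        {
            if (--refCount[id] == 0)
            {
                // In the real manager this is where the block is handed back to the
                // eviction policy and becomes evictable again.
            }
        }
    }
};

int main()
{
    ToyBlockManager mgr;
    auto pinned = mgr.storeBlocksForReuse({7, 8, 9}, /*pinBlocks=*/true);
    std::printf("pinned %zu blocks\n", pinned.size());
    mgr.unpinBlocksById(pinned); // e.g. after the disaggregated KV transfer completes
}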

View File

@ -1667,6 +1667,12 @@ public:
[](auto reason) { return reason == executor::FinishReason::kLENGTH; });
}
[[nodiscard]] bool isFinishedDueToCancellation() const noexcept
{
return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),
[](auto reason) { return reason == executor::FinishReason::kCANCELLED; });
}
[[nodiscard]] bool isTimedOut() const
{
if (!mAllottedTimeMs.has_value())

View File

@ -129,6 +129,18 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena
#define SLIDING_WINDOW 0
#endif
#ifndef SKIP_SOFTMAX_ATTN
#define SKIP_SOFTMAX_ATTN 0
#endif
#ifndef SKIP_SOFTMAX_ATTN_BLOCK_STATS
#define SKIP_SOFTMAX_ATTN_BLOCK_STATS 0
#endif
#ifndef SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
#define SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE 1
#endif
// 0 - no PDL
// 1 - naive PDL
// 2 - aggressive PDL (implemented only in mha_sm90.cu for now)

View File

@ -106,6 +106,7 @@ __device__ inline MatDesc makeMatDesc(void const* data, uint32_t dimKByteOffset,
asm volatile("trap;\n");
return 0;
}();
assert(__cvta_generic_to_shared(data) % baseAlign == 0);
uint32_t const baseOffset = ((patternAddr % baseAlign == 0) ? 0U : ((patternAddr >> 0x7) & 0x7));
return MatDesc{
/*addr=*/MatDesc::encode(__cvta_generic_to_shared(data)),

View File

@ -2734,6 +2734,25 @@ static constexpr auto kernel_mha = kernel_mha_impl;
#endif
#ifndef GENERATE_CUBIN
uint32_t computeNbSubSeqPerSeqMHA(cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen)
{
if (!allowMultiBlockMode)
{
return 1;
}
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
return std::min<uint32_t>(
std::max<uint32_t>(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x));
}
void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@ -2771,6 +2790,13 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor, // for compatibility with mha_sm90.cu only
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, // for compatibility with mha_sm90.cu only
uint32_t* __restrict__ totalBlockCount, // for compatibility with mha_sm90.cu only
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream)
{
@ -2793,24 +2819,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
uint32_t const nbQHeads = nbKHeads * headGrpSize;
// const uint32_t nbSubSeqPerSeq = allowMultiBlockMode ? DBG_NB_CTAS_PER_SEQ : 1;
uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t
{
if (!allowMultiBlockMode)
{
return 1;
}
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
return std::min<uint32_t>(
std::max<uint32_t>(1U, prop.multiProcessorCount / (batchSize * nbKHeads)), divUp(maxSeqLen, ctaTile.x));
}();
uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqMHA(prop, batchSize, nbKHeads, maxSeqLen);
// gridDim.z == batchSize && gridDim.y == nbKHeads && gridDim.x == nbSubSeqPerSeq
#if SPEC_DEC
const uint32_t nbTokenBlocksPerGrp = divUp(qSeqLen * headGrpSize, rowsPerBlock);
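The heuristic factored out into computeNbSubSeqPerSeqMHA above caps the number of sub-sequences per sequence at whichever is smaller: the SM budget left per (request, KV head) pair or the number of CTA tiles in the sequence, with XQA_NB_SUB_SEQ as an override. A standalone restatement with hypothetical numbers (132 SMs, 64-token CTA tile), for illustration only:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Toy re-statement of the heuristic; ctaTileX stands in for ctaTile.x.
uint32_t nbSubSeqPerSeq(uint32_t smCount, uint32_t batchSize, uint32_t nbKHeads,
                        uint32_t maxSeqLen, uint32_t ctaTileX)
{
    if (char const* env = std::getenv("XQA_NB_SUB_SEQ"))
    {
        int32_t const val = std::atoi(env);
        if (val > 0)
        {
            return static_cast<uint32_t>(val);
        }
    }
    uint32_t const byOccupancy = std::max(1u, smCount / (batchSize * nbKHeads));
    uint32_t const byTiles = (maxSeqLen + ctaTileX - 1) / ctaTileX; // divUp
    return std::min(byOccupancy, byTiles);
}

int main()
{
    // 132 SMs, batch 4, 8 KV heads, 8192 tokens, 64-token tile:
    // occupancy bound = 132 / 32 = 4, tile bound = 128 -> 4 sub-sequences per sequence.
    std::printf("%u\n", nbSubSeqPerSeq(132, 4, 8, 8192, 64));
}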

View File

@ -90,6 +90,9 @@ struct BeamSearchParams
// match trt-llm API.
};
uint32_t computeNbSubSeqPerSeqMHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen);
void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@ -127,9 +130,18 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream);
uint32_t computeNbSubSeqPerSeqHopperF8MHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen);
void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@ -167,6 +179,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream);

View File

@ -49,6 +49,10 @@ static_assert(specDecQLen * headGrpSize <= 32, "SPEC_Q_SEQ_LEN macro value is to
#define SWAP_AB (!SPEC_DEC)
#endif
#if SKIP_SOFTMAX_ATTN
static_assert(SWAP_AB && USE_PAGED_KV_CACHE && !SPEC_DEC && BEAM_WIDTH == 1, "SKIP_SOFTMAX_ATTN is not supported.");
#endif
#define IS_SUPPORTED_F16_CASE (CACHE_ELEM_ENUM == 0 && !SPEC_DEC && SWAP_AB && !USE_INPUT_KV && !LOW_PREC_OUTPUT)
inline constexpr bool swapAB = SWAP_AB;
@ -138,26 +142,38 @@ using PaddedOutHead = PaddedInputHead;
struct alignas(128) SharedMem
{
using QBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerQPart>, nbQParts>;
using KBuffer = Array2D<LdGrain, gemm0CtaTileNbTokens, exactDiv(cacheHeadPartBytes, grainBytes)>;
static constexpr uint32_t nbKBuf = 2;
KBuffer k[nbKBuf]; // as is loaded from global mem.
using XBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerXPart>, nbXParts>;
static constexpr uint32_t nbXBuf
= 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
using VBuffer = Vec<Array2D<LdGrain, gemm1CtaTileNbTokens, exactDiv(cacheHeadPartBytes, grainBytes),
sizeof(XBuffer) % (cacheHeadPartBytes * 8) == 0>,
cacheHeadNbParts>;
#if !SWAP_AB
using VTBuffer = Array2D<LdGrain, headElems, exactDiv(gemm1CtaTileNbTokens, cacheElemsPerGrain), true>;
#endif
static constexpr uint32_t nbVBuf = 2;
#if CACHE_ELEM_ENUM == 0
using OutSwizzleBuf = Array2D<LdGrain, ctaNbQHeads, grainsPerPaddedInputHead>;
#elif CACHE_ELEM_ENUM == 2
using OutSwizzleBuf = Array2D<Vec<Vec<InputElem, 4>, 4>, ctaNbQHeads, exactDiv(headElems, 4 * 4)>;
#endif
#if SKIP_SOFTMAX_ATTN
static constexpr uint32_t nbKBuf = 2;
static constexpr uint32_t nbVBuf = 3; // @fixme: skip_softmax_attn: for skip softmax attn, an extra VBuffer is used
static constexpr uint32_t nbXBuf
= 3 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
#else
static constexpr uint32_t nbKBuf = 2;
static constexpr uint32_t nbVBuf = 2;
static constexpr uint32_t nbXBuf
= 2 * (gemm0CtaTileNbTokens >= gemm1CtaTileNbTokens ? 1 : exactDiv(gemm1CtaTileNbTokens, gemm0CtaTileNbTokens));
#endif
static_assert(nbXBuf == nbVBuf);
// note: buffers used for GMMA may have additional alignment requirements
KBuffer k[nbKBuf]; // as is loaded from global mem.
QBuffer q; // For gmma math. Conversion done if needed.
union ReusedXVOutSwizzleBuf
{
struct XV
@ -196,9 +212,6 @@ struct alignas(128) SharedMem
return reusedXVOutSwizzleBuf[i].outSwizzle;
}
using QBuffer = Vec<Array2D<LdGrain, ctaNbQHeads, grainsPerQPart>, nbQParts>;
QBuffer q; // For gmma math. Conversion done if needed.
// @fixme: move these into reusedXVOutSwizzleBuf
#if SWAP_AB
ShmQWiseVec xColMax[nbXBuf];
@ -220,6 +233,11 @@ struct alignas(128) SharedMem
Vec<KVCachePageIndex, nbPagesPerTile> pages[2]; // one for K and one for V
#endif
#if SKIP_SOFTMAX_ATTN
uint32_t skipSoftmaxVotesGemm0ToV[nbXBuf]; // guarded by skipSoftmaxXBar
uint32_t skipSoftmaxVotesGemm0ToGemm1[nbXBuf]; // guarded by xBar
#endif
// mem barriers
CtaBarrierPair qBar;
@ -229,6 +247,9 @@ struct alignas(128) SharedMem
CtaBarrierPair vtBar[nbVBuf];
#endif
CtaBarrierPair xBar[nbXBuf];
#if SKIP_SOFTMAX_ATTN
CtaBarrierPair skipSoftmaxXBar[nbXBuf]; // for V to wait for X to be ready
#endif
// used internally in the gemm0 warp group
// @fixme: use separate arrive and wait for all usage
@ -425,8 +446,13 @@ __device__ void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
#endif
#if SWAP_AB
#if SKIP_SOFTMAX_ATTN
__device__ RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src,
float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip);
#else
__device__ RegColWiseVec computeWarpGrpColMax_sync(
CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src);
#endif
__device__ void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32_t validRowBeg, uint32_t validRowEnd);
__device__ void warpGrpOnlineSoftmax(Gemm0Acc& acc, RegColWiseVec const& colMax);
__device__ RegColWiseVec computeWarpColSum(Gemm0Acc& src);
@ -675,6 +701,12 @@ CUBIN_EXPORT __global__
#endif
#if SPEC_DEC
SpecDecParams const specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* __restrict__ const semaphores
= nullptr, // [nbReq][nbKHeads][divUp(specDecParams.qSeqLen, inputTokensPerCta)]
@ -753,6 +785,10 @@ CUBIN_EXPORT __global__
uint32_t const nbSubSeq = isMultiBlockMode ? mha::min(nbTilesInUse / multiBlockMinNbTilesPerCta, maxNbSubSeq) : 1;
static_assert(multiBlockMinNbTiles >= multiBlockMinNbTilesPerCta * 2);
assert(isMultiBlockMode == (nbSubSeq > 1));
#if SKIP_SOFTMAX_ATTN
bool const disableSkipForShortSeq = (cacheSeqLen < skipSoftmaxThresholdScaleFactor);
float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / cacheSeqLen;
#endif
if (idxSubSeq >= nbSubSeq)
{
return;
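The threshold setup added above turns the user-facing scale factor into a per-sequence threshold: skipping is disabled outright for sequences shorter than the scale factor; otherwise the threshold is scaleFactor / cacheSeqLen, and a tile is a skip candidate only when its local max trails the running max by more than log(threshold). A small standalone illustration with made-up values (the scale factor and sequence lengths are not defaults from this commit):

#include <cmath>
#include <cstdio>

int main()
{
    float const scaleFactor = 500.f; // skipSoftmaxThresholdScaleFactor
    unsigned const seqLens[] = {300u, 4096u};
    for (unsigned cacheSeqLen : seqLens)
    {
        bool const disable = cacheSeqLen < scaleFactor;          // short sequences never skip
        float const threshold = disable ? 0.f : scaleFactor / cacheSeqLen;
        // A tile is a skip candidate when localMax - runningMax < log(threshold),
        // i.e. its largest softmax weight relative to the running max is below threshold.
        // With threshold = 0 the log would be -inf, so the skip test can never pass.
        std::printf("seqLen=%u threshold=%.4f logThreshold=%.3f\n", cacheSeqLen, threshold,
            disable ? -INFINITY : std::log(threshold));
    }
}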
@ -776,21 +812,34 @@ CUBIN_EXPORT __global__
assert(dynamicSmemSize() >= sizeof(SharedMem));
SharedMem& smem = *reinterpret_cast<SharedMem*>(&smemByteBuf[0]);
constexpr uint32_t nbBuffers = 2;
static_assert(nbBuffers == SharedMem::nbKBuf && nbBuffers == SharedMem::nbVBuf && nbBuffers == SharedMem::nbXBuf);
if (wid < nbBuffers)
constexpr uint32_t maxNbBuffers = (SharedMem::nbXBuf > SharedMem::nbVBuf) ? SharedMem::nbXBuf : SharedMem::nbVBuf;
static_assert(
maxNbBuffers >= SharedMem::nbKBuf && maxNbBuffers >= SharedMem::nbVBuf && maxNbBuffers >= SharedMem::nbXBuf);
if (wid < maxNbBuffers)
{
if (warpElectSync())
{
smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size);
smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size);
#if !SWAP_AB
smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2);
if (wid < SharedMem::nbKBuf)
{
smem.kBar[wid].initialize(gemm0NbThrds, gemm0NbThrds + warp_size);
}
if (wid < SharedMem::nbXBuf)
{
#if SKIP_SOFTMAX_ATTN
smem.skipSoftmaxXBar[wid].initialize(gemm0NbThrds + warp_size, gemm0NbThrds + warp_size);
smem.vBar[wid].initialize(gemm1NbThrds + warp_size, gemm1NbThrds + warp_size);
#else
smem.vBar[wid].initialize(gemm1NbThrds, gemm1NbThrds + warp_size);
#endif
smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds);
#if !SWAP_AB
smem.vtBar[wid].initialize(gemm1NbThrds * 2, gemm1NbThrds * 2);
#endif
smem.xBar[wid].initialize(gemm0NbThrds + gemm1NbThrds, gemm0NbThrds + gemm1NbThrds);
}
}
}
else if (wid == nbBuffers)
else if (wid == maxNbBuffers)
{
if (warpElectSync())
{
@ -819,6 +868,10 @@ CUBIN_EXPORT __global__
SpecDec const specDec{specDecParams, idxReq, idxInputSubSeq, cacheSeqLen};
#endif
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t localSkippedBlockCount = 0;
#endif
// QK gemm
constexpr uint32_t nbGmmaInstM = exactDiv(gemm0CtaTileNbTokens, gmma::instM);
using Acc = GmmaAcc<gemm0CtaTileNbTokens, ctaNbQHeads>;
@ -940,10 +993,39 @@ CUBIN_EXPORT __global__
}
}
#endif
uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf;
auto& xBar = smem.xBar[idxXBuf];
// update colMax in shared mem and get a register copy
#if SWAP_AB
#if SKIP_SOFTMAX_ATTN
auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf];
skipSoftmaxXBar.consumed.arrive_and_wait();
bool const maybeSkip = !disableSkipForShortSeq && idxIter != 0;
RegColWiseVec const colMax = computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc,
skipSoftmaxThreshold, &smem.skipSoftmaxVotesGemm0ToV[idxXBuf], maybeSkip);
bool const shouldSkipSoftmaxAttn = static_cast<bool>(smem.skipSoftmaxVotesGemm0ToV[idxXBuf]);
unused(skipSoftmaxXBar.produced.arrive());
warpGrpOnlineSoftmax(acc, colMax);
if (shouldSkipSoftmaxAttn)
{
xBar.consumed.arrive_and_wait();
if (threadIdx.x == 0)
{
smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 1U;
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
localSkippedBlockCount++;
#endif
}
asm volatile("fence.proxy.async.shared::cta;\n"); // maybe not used
unused(xBar.produced.arrive());
continue;
}
#else
RegColWiseVec const colMax = computeWarpGrpColMax_sync(smem.gemm0WarpGrpBar, smem.gemm0CurrentSeqMax, acc);
warpGrpOnlineSoftmax(acc, colMax);
#endif
#else
RegRowWiseVec const rowMax = computeWarpGrpRowMax_sync(warpRank, smem.gemm0CurrentSeqMax, acc);
warpGrpOnlineSoftmax(acc, rowMax);
@ -959,8 +1041,6 @@ CUBIN_EXPORT __global__
// map 1 to fp8_max before conversion to fp8
acc = acc * kE4M3_MAX;
uint32_t const idxXBuf = idxIter % SharedMem::nbXBuf;
auto& xBar = smem.xBar[idxXBuf];
// @fixme: for fp16/bf16, try not to transpose acc here, and leave it to the next GEMM.
#if SWAP_AB
storeGemm0AccToShm(warpRank, laneId(), smem.xBuf(idxXBuf), xBar.consumed, acc);
@ -989,13 +1069,25 @@ CUBIN_EXPORT __global__
storeShmRowWiseVec(warpRank, smem.xRowMax[idxXBuf], rowMax);
storeShmRowWiseVec(warpRank, smem.xRowSum[idxXBuf], rowSum);
#endif
#if SKIP_SOFTMAX_ATTN
if (threadIdx.x == 0)
{
smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 0;
}
#endif
__syncwarp();
// the release semantics of arrive does not work for async consumers like gmma. additional fence is
// needed.
asm volatile("fence.proxy.async.shared::cta;\n");
unused(xBar.produced.arrive());
}
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
if (threadIdx.x == 0 && skippedBlockCount != nullptr && totalBlockCount != nullptr)
{
atomicAdd(skippedBlockCount, localSkippedBlockCount);
atomicAdd(totalBlockCount, nbIters);
}
#endif
unused(smem.qBar.consumed.arrive());
}
else if (warpIdx.z == 1)
@ -1043,216 +1135,231 @@ CUBIN_EXPORT __global__
uint32_t idxVTile = idxVTileInit + idxIter * nbSubSeq;
auto const idxVBuf = idxIter % SharedMem::nbVBuf;
auto const idxXBuf = idxVBuf;
auto& vBar = smem.vBar[idxVBuf];
arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds));
auto const& vBuf = smem.vBuf(idxVBuf);
#if !SWAP_AB
CtaBarrierPair& vtBar = smem.vtBar[idxVBuf];
auto& vtBuf = smem.vtBuf(idxVBuf);
vtBar.consumed.arrive_and_wait();
transposeVTile(warpRank, laneId(), vtBuf, vBuf);
vBar.consumed.arrive();
vtBar.produced.arrive();
#endif
auto& xBar = smem.xBar[idxXBuf];
auto& vBar = smem.vBar[idxVBuf];
auto const& vBuf = smem.vBuf(idxVBuf);
xBar.produced.arrive_and_wait();
#if SKIP_SOFTMAX_ATTN
bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf]; // guarded by xBar
if (shouldSkipSoftmaxAttn)
{
vBar.produced.arrive_and_wait();
}
#endif
#if SKIP_SOFTMAX_ATTN
if (!shouldSkipSoftmaxAttn) // skip XVGemm
#endif
{
arrive_tx_and_wait(vBar.produced, exactDiv(sizeof(SharedMem::VBuffer), gemm1NbThrds));
#if !SWAP_AB
CtaBarrierPair& vtBar = smem.vtBar[idxVBuf];
auto& vtBuf = smem.vtBuf(idxVBuf);
vtBar.consumed.arrive_and_wait();
transposeVTile(warpRank, laneId(), vtBuf, vBuf);
vBar.consumed.arrive();
vtBar.produced.arrive();
#endif
#if !defined(NDEBUG) && DBG_PRINT
#if SWAP_AB
if (threadIdx.x == 0)
{
printf("colMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColMax[idxXBuf][i]);
}
printf("\n");
printf("colSum:\n");
for (int n = 0; n < 4; n++)
if (threadIdx.x == 0)
{
printf("colMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColSum[idxXBuf][n][i]);
printf("%f, ", smem.xColMax[idxXBuf][i]);
}
printf("\n");
printf("colSum:\n");
for (int n = 0; n < 4; n++)
{
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xColSum[idxXBuf][n][i]);
}
printf("\n");
}
printf("\n");
printf("X:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
for (int j = 0; j < gemm0CtaTileNbTokens; j++)
{
auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart);
auto const e = reinterpret_cast<Vec<__nv_fp8_e4m3, 16>&>(
smem.xBuf(idxXBuf)[j / elemsPerXPart].template at<true>(
i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain];
printf("%.2f, ", float(e));
if (j % 16 == 15)
{
printf("| ");
}
}
printf("\n\n");
}
}
smem.gemm1WarpGrpBar.arrive_and_wait();
#else
if (blockIdx.y == 1 && threadIdx.x == 0)
{
printf("rowMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowMax[idxXBuf][i]);
}
printf("\n");
printf("rowSum:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowSum[idxXBuf][i]);
}
printf("\n");
}
printf("\n");
printf("X:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
for (int j = 0; j < gemm0CtaTileNbTokens; j++)
{
auto const& elemsPerXPart = (cacheElemsPerGrain * grainsPerXPart);
auto const e = reinterpret_cast<Vec<__nv_fp8_e4m3, 16>&>(
smem.xBuf(idxXBuf)[j / elemsPerXPart].template at<true>(
i, j % elemsPerXPart / cacheElemsPerGrain))[j % cacheElemsPerGrain];
printf("%.2f, ", float(e));
if (j % 16 == 15)
{
printf("| ");
}
}
printf("\n\n");
}
}
smem.gemm1WarpGrpBar.arrive_and_wait();
#else
if (blockIdx.y == 1 && threadIdx.x == 0)
{
printf("rowMax:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowMax[idxXBuf][i]);
}
printf("\n");
printf("rowSum:\n");
for (int i = 0; i < ctaNbQHeads; i++)
{
printf("%f, ", smem.xRowSum[idxXBuf][i]);
}
printf("\n");
}
smem.gemm1WarpGrpBar.arrive_and_wait();
smem.gemm1WarpGrpBar.arrive_and_wait();
#endif
#endif
#if SWAP_AB
// @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead.
rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar);
// @fixme: if first tile, no need to rescale acc. For persistent CTA, just re-initialize acc instead.
rescaleGemm1AccForNewColMax_sync(warpRank, smem.xColMax[idxXBuf], smem.xColSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum, smem.gemm1WarpGrpBar);
#else
rescaleGemm1AccForNewRowMax_sync(
warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf], smem.gemm1AccColMax, acc, smem.gemm1AccColSum);
rescaleGemm1AccForNewRowMax_sync(warpRank, smem.xRowMax[idxXBuf], smem.xRowSum[idxXBuf],
smem.gemm1AccColMax, acc, smem.gemm1AccColSum);
#endif
auto& xBuf = smem.xBuf(idxXBuf);
auto& xBuf = smem.xBuf(idxXBuf);
auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::XBuffer::Elem{}))
.raw();
auto const descXBase = gmma::makeMatDesc(nullptr, 0, SharedMem::XBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::XBuffer::Elem{}))
.raw();
#if CACHE_ELEM_ENUM == 0
auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::VBuffer::Elem{}))
.raw();
auto const descVBase = gmma::makeMatDesc(nullptr, 0, SharedMem::VBuffer::Elem::rowBytes * 8,
gmma::getSwizzleMode<true>(SharedMem::VBuffer::Elem{}))
.raw();
#endif
#if SWAP_AB
//@fixme: to reduce code size, we can disable unroll and use double-buffer for LDSM in loadVTileTransposed.
#pragma unroll
for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++)
{
for (uint32_t idxInstK = 0; idxInstK < gemm1NbGmmaInstK; idxInstK++)
{
#if CACHE_ELEM_ENUM == 2
Vec<RegMatAFrag, gemm1NbGmmaInstM> const fragA
= loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK);
Vec<RegMatAFrag, gemm1NbGmmaInstM> const fragA
= loadVTileTransposed(warpRank, laneId(), vBuf, idxInstK);
#if !defined(NDEBUG) && DBG_PRINT
if (threadIdx.x == 0)
{
printf("fragA:\nidxInstK == %u\n", idxInstK);
}
smem.gemm1WarpGrpBar.arrive_and_wait();
for (int m = 0; m < 2; m++)
{
for (int w = 0; w < 4; w++)
if (threadIdx.x == 0)
{
if (warpRank == w)
printf("fragA:\nidxInstK == %u\n", idxInstK);
}
smem.gemm1WarpGrpBar.arrive_and_wait();
for (int m = 0; m < 2; m++)
{
for (int w = 0; w < 4; w++)
{
if (laneId() == 0)
if (warpRank == w)
{
printf(" warpRank = %u\n", warpRank);
}
__syncwarp();
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 8; b++)
if (laneId() == 0)
{
for (int c = 0; c < 2; c++)
printf(" warpRank = %u\n", warpRank);
}
__syncwarp();
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 8; b++)
{
for (int d = 0; d < 4; d++)
for (int c = 0; c < 2; c++)
{
if (laneId() == b * 4 + d)
for (int d = 0; d < 4; d++)
{
for (int e = 0; e < 4; e++)
if (laneId() == b * 4 + d)
{
auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>(
fragA[m](0, c)(a, 0));
printf("%.2f, ", float(elem4[e]));
for (int e = 0; e < 4; e++)
{
auto const& elem4 = reinterpret_cast<__nv_fp8_e4m3 const(&)[4]>(
fragA[m](0, c)(a, 0));
printf("%.2f, ", float(elem4[e]));
}
}
__syncwarp();
}
__syncwarp();
}
if (laneId() == 0)
{
printf("\n");
}
__syncwarp();
}
if (laneId() == 0)
if (laneId() == 0 && a == 0)
{
printf("\n");
printf("----------------------\n");
}
__syncwarp();
}
if (laneId() == 0 && a == 0)
{
printf("----------------------\n");
}
__syncwarp();
}
smem.gemm1WarpGrpBar.arrive_and_wait();
}
smem.gemm1WarpGrpBar.arrive_and_wait();
}
}
#endif
#endif
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * idxInstK};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
0, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * idxInstK};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
0, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
#if CACHE_ELEM_ENUM == 2
gmma::fence();
gmma::fence();
#endif
#pragma unroll
for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++)
{
for (uint32_t idxInstM = 0; idxInstM < gemm1NbGmmaInstM; idxInstM++)
{
#if CACHE_ELEM_ENUM == 0
auto const descV
= addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0));
gmma::mma_async_shmA<MathElem, ctaNbQHeads, true, false>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
descV, descX, true);
auto const descV
= addAddr(descVBase, &vBuf[idxInstM](kOffsetInGrains.get() * cacheElemsPerGrain, 0));
gmma::mma_async_shmA<MathElem, ctaNbQHeads, true, false>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
descV, descX, true);
#elif CACHE_ELEM_ENUM == 2
gmma::mma_async_regA<MathElem, ctaNbQHeads>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
reinterpret_cast<uint32_t const(&)[2][2][1]>(fragA[idxInstM]), descX, true);
gmma::mma_async_regA<MathElem, ctaNbQHeads>(
reinterpret_cast<float(&)[exactDiv(ctaNbQHeads, gmma::instNBase)][2][2]>(acc(idxInstM, 0)),
reinterpret_cast<uint32_t const(&)[2][2][1]>(fragA[idxInstM]), descX, true);
#endif
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
}
#else
auto const descVTBase = gmma::makeMatDesc(
nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode<true>(SharedMem::VTBuffer{}))
.raw();
vtBar.produced.arrive_and_wait();
auto const descVTBase = gmma::makeMatDesc(
nullptr, 0, SharedMem::VTBuffer::rowBytes * 8, gmma::getSwizzleMode<true>(SharedMem::VTBuffer{}))
.raw();
vtBar.produced.arrive_and_wait();
// if (idxIter == 1 && threadIdx.x == 0) {
// printf("vtBuf:\n");
// dbg::printArray2D<__nv_fp8_e4m3, true>(vtBuf);
// }
#pragma unroll
for (uint32_t m = 0; m < Gemm1Acc::rows; m++)
{
#pragma unroll
for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++)
for (uint32_t m = 0; m < Gemm1Acc::rows; m++)
{
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * k};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
gmma::instM * m, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
auto const descVT = addAddr(
descVTBase, &vtBuf(0, kOffsetInGrains.template mod<SharedMem::VTBuffer::cols>().get()));
gmma::mma_async_shmA<MathElem, headElems>(
reinterpret_cast<float(&)[exactDiv(headElems, gmma::instNBase)][2][2]>(acc(m, 0)), descX,
descVT, true);
#pragma unroll
for (uint32_t k = 0; k < gemm1NbGmmaInstK; k++)
{
BoundedVal<grainsPerInstK * gemm1NbGmmaInstK> const kOffsetInGrains{grainsPerInstK * k};
auto const descX = addAddr(descXBase,
&xBuf[kOffsetInGrains.template divBy<SharedMem::XBuffer::Elem::cols>().get()](
gmma::instM * m, kOffsetInGrains.template mod<SharedMem::XBuffer::Elem::cols>().get()));
auto const descVT = addAddr(
descVTBase, &vtBuf(0, kOffsetInGrains.template mod<SharedMem::VTBuffer::cols>().get()));
gmma::mma_async_shmA<MathElem, headElems>(
reinterpret_cast<float(&)[exactDiv(headElems, gmma::instNBase)][2][2]>(acc(m, 0)), descX,
descVT, true);
}
}
}
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of gmma.
gmma::wait_group<0>();
gmma::commit_group();
//@fixme: delay wait and consumption to next tile. Note that fragA must also persist until finish of
// gmma.
gmma::wait_group<0>();
#endif
}
if (idxIter == nbIters - 1)
{
// gmma::wait_group should have already synchronized threads, so this may be unnecessary.
@ -1471,8 +1578,24 @@ CUBIN_EXPORT __global__
tensorMap
#endif
};
#if SKIP_SOFTMAX_ATTN
for (auto& b : smem.skipSoftmaxXBar)
{
unused(b.consumed.arrive());
}
#endif
for (uint32_t idxIter = 0; idxIter < nbIters; idxIter++)
{
uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf;
auto& vBar = smem.vBar[idxVBuf];
#if SKIP_SOFTMAX_ATTN
uint32_t idxXBuf = idxIter % SharedMem::nbXBuf;
auto& skipSoftmaxXBar = smem.skipSoftmaxXBar[idxXBuf];
skipSoftmaxXBar.produced.arrive_and_wait();
bool shouldSkipSoftmaxAttn = smem.skipSoftmaxVotesGemm0ToV[idxXBuf];
skipSoftmaxXBar.consumed.arrive();
#endif
uint32_t const idxVTile = idxVTileInit + idxIter * nbSubSeq;
vTileLoader.loadPages(idxVTile);
#if USE_INPUT_KV || ENABLE_PDL == 2
@ -1506,8 +1629,20 @@ CUBIN_EXPORT __global__
}
#endif
uint32_t const idxVBuf = idxIter % SharedMem::nbVBuf;
auto& vBar = smem.vBar[idxVBuf];
#if SKIP_SOFTMAX_ATTN
if (shouldSkipSoftmaxAttn)
{
vBar.consumed.arrive_and_wait();
// Compared to non-skip softmax attn, we need to increase the vBar.produced count to avoid a race
// condition where vBar.consumed is arrived again without a wait. Without skip softmax attn, XVGemm
// will wait for tx_count, so its progress won't go ahead of the vload warp. With skip softmax attn,
// the XVGemm WG may go ahead of the vload warp, as the previous vBar only has XVGemm WG threads and
// a tx_count (now = 0). Then it may arrive vBar.consumed before it is arrive_and_wait-ed.
vBar.produced.arrive();
continue;
}
#endif
vBar.consumed.arrive_and_wait();
if (warpElectSync())
{
@ -1517,6 +1652,9 @@ CUBIN_EXPORT __global__
vTileLoader.loadData(smem.vBuf(idxVBuf)[idxPart], idxVTile, idxPart, vBar.produced);
}
}
#if SKIP_SOFTMAX_ATTN
vBar.produced.arrive();
#endif
__syncwarp();
}
}
@ -1992,9 +2130,23 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec,
#endif // SPEC_DEC
// smemColMax is persistent across multiple iterations
#if SKIP_SOFTMAX_ATTN
__device__ inline RegColWiseVec computeWarpGrpColMax_sync(CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax,
Gemm0Acc const& src, float skipSoftmaxThreshold, uint32_t* smemSkipVote, bool maybeSkip)
#else
__device__ inline RegColWiseVec computeWarpGrpColMax_sync(
CtaBarrier& warpGrpBar, ShmQWiseVec& smemColMax, Gemm0Acc const& src)
#endif
{
#if SKIP_SOFTMAX_ATTN
if (threadIdx.x == 0)
{
*smemSkipVote = maybeSkip ? 1U : 0U; // will sync before vote
}
float const lnThreshold
= log(skipSoftmaxThreshold); // this can be -inf, but should be safe as we only use it for comparison
#endif
auto colMax = RegColWiseVec::filled(Vec<float, 2>::filled(safeInitRowMax));
#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
@ -2029,6 +2181,9 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
}
uint32_t const lane = laneId();
#if SKIP_SOFTMAX_ATTN
auto prevOrCurrentMax = RegColWiseVec();
#if SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
if (lane < 4)
{
#pragma unroll
@ -2037,12 +2192,43 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
#pragma unroll
for (uint32_t j = 0; j < 2; j++)
{
atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
prevOrCurrentMax[n][j] = smemColMax[8 * n + 2 * lane + j];
}
}
}
warpGrpBar.arrive_and_wait();
#endif
#endif
if (lane < 4)
{
#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
{
#pragma unroll
for (uint32_t j = 0; j < 2; j++)
{
#if SKIP_SOFTMAX_ATTN && !SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
// prevOrCurrentMax <= the actual smemColMax (after updates from all 4 warps are done), but is always >=
// smemColMax(Prev), the smemColMax value from *before* this tile is computed.
// When determining whether to skip, it is safe to use prevOrCurrentMax: 1) if all 4 warps' localMax <
// smemColMax(Prev), then prevOrCurrentMax == smemColMax(Prev) and the result is not affected; 2) if some
// localMax > smemColMax(Prev), then prevOrCurrentMax > smemColMax(Prev) and some warps may incorrectly
// vote skip, but at least one warp whose localColMax is larger will not skip, so the tile is not skipped.
// This saves some sync and checks, but has an issue when threshold > 1.
prevOrCurrentMax[n][j] = atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
#else
atomicMax(&smemColMax[8 * n + 2 * lane + j], colMax[n][j]);
#endif
}
}
}
warpGrpBar.arrive_and_wait();
uint32_t const idxInQuad = lane % 4;
#if SKIP_SOFTMAX_ATTN
bool localShouldSkip = true;
#endif
#pragma unroll
for (uint32_t n = 0; n < src.cols; n++)
@ -2050,10 +2236,21 @@ __device__ inline RegColWiseVec computeWarpGrpColMax_sync(
#pragma unroll
for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++)
{
#if SKIP_SOFTMAX_ATTN
if (lane < 4 && 8 * n + 2 * idxInQuad + j < headGrpSize)
{
localShouldSkip &= (colMax[n][j] - prevOrCurrentMax[n][j]) < lnThreshold;
}
#endif
assert(colMax[n][j] <= smemColMax[8 * n + 2 * idxInQuad + j]);
colMax[n][j] = smemColMax[8 * n + 2 * idxInQuad + j];
}
}
#if SKIP_SOFTMAX_ATTN
atomicAnd(smemSkipVote, static_cast<uint32_t>(localShouldSkip)); // this will be translated to redux and voteu
#endif
warpGrpBar.arrive_and_wait();
return colMax;
}
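The vote added to computeWarpGrpColMax_sync works as a conjunction: the shared flag starts at maybeSkip, every participating lane ANDs in its local decision, and the tile is skipped only if nobody objects. A minimal host-side model of that semantics, using std::atomic in place of the shared-memory atomicAnd (per-warp granularity and all numbers are hypothetical simplifications):

#include <atomic>
#include <cmath>
#include <cstdio>

int main()
{
    float const lnThreshold = std::log(0.12f);
    float const runningMax[4] = {8.0f, 7.5f, 9.0f, 8.2f}; // smemColMax before this tile
    float const localMax[4]   = {5.0f, 4.8f, 7.5f, 5.5f}; // this tile's per-warp column max
    std::atomic<unsigned> vote{1u};                       // starts at maybeSkip
    for (int w = 0; w < 4; ++w)
    {
        bool const warpSkips = (localMax[w] - runningMax[w]) < lnThreshold;
        vote.fetch_and(warpSkips ? 1u : 0u); // kernel: atomicAnd(smemSkipVote, localShouldSkip)
    }
    // Warp 2 still sees a significant column max (7.5 vs 9.0), so the whole tile is kept.
    std::printf("skip tile: %u\n", vote.load());
}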
@ -2199,7 +2396,7 @@ __device__ inline void storeGemm0AccToShm(
uint32_t const idxOctInsideHalf = idxInHalf / 8;
uint32_t const idxRowInsideOct = lane % 8;
uint32_t const warpBaseC = 16 * warpRank;
auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> std::pair<uint32_t, uint32_t>
auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> mha::pair<uint32_t, uint32_t>
{
uint32_t const accR = idxAccCoreMat / Gemm0Acc::cols;
uint32_t const accC = idxAccCoreMat % Gemm0Acc::cols;
@ -3231,6 +3428,24 @@ __device__ inline void storeRotatedPairsForQ(SharedMem::QBuffer& dst,
}
#ifndef GENERATE_CUBIN
uint32_t computeNbSubSeqPerSeqHopperF8MHA(
cudaDeviceProp const& prop, uint32_t batchSize, uint32_t nbKHeads, uint32_t maxSeqLen)
{
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
float const factor = 0.25f;
return mha::min<uint32_t>(
mha::max<uint32_t>(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)),
divUp(maxSeqLen, gemm0CtaTileNbTokens));
}
void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#if SLIDING_WINDOW
uint32_t slidingWinSize,
@ -3268,6 +3483,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
// int8/fp8 KV cache.
#if SPEC_DEC
SpecDecParams const& specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
float const skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
#endif
#endif
uint32_t* semaphores, void* scratch, cudaStream_t stream)
{
@ -3286,22 +3507,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
uint32_t const nbVHeads = nbKHeads;
uint32_t const nbQHeads = nbKHeads * headGrpSize;
uint32_t const nbQKVHeads = nbQHeads + nbKHeads + nbVHeads;
uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t
{
auto const env = std::getenv("XQA_NB_SUB_SEQ");
if (env != nullptr)
{
int32_t const val = std::stoi(env);
if (val > 0)
{
return val;
}
}
float const factor = 0.25f;
return mha::min<uint32_t>(
mha::max<uint32_t>(1U, (uint32_t) round(prop.multiProcessorCount * 3 / (batchSize * nbKHeads) * factor)),
divUp(maxSeqLen, gemm0CtaTileNbTokens));
}();
uint32_t const nbSubSeqPerSeq = computeNbSubSeqPerSeqHopperF8MHA(prop, batchSize, nbKHeads, maxSeqLen);
#if SPEC_DEC
uint32_t const qSeqLen = specDecParams.qSeqLen;
#else
@ -3371,6 +3577,12 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
#endif
#if SPEC_DEC
specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
skippedBlockCount, totalBlockCount,
#endif
#endif
semaphores, scratch);
#else

View File

@ -1272,6 +1272,19 @@ using is_void = is_same<remove_cv_t<T>, void>;
template <typename T>
inline constexpr bool is_void_v = is_void<T>::value;
#endif
#ifndef GENERATE_CUBIN
template <typename T1, typename T2>
using pair = std::pair<T1, T2>;
#else
template <typename T1, typename T2>
struct pair
{
T1 first;
T2 second;
};
#endif
} // namespace mha
#if GENERATE_CUBIN

View File

@ -50,7 +50,8 @@ using Vector = Matrix<Type, Size, 1>;
template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks)
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)
{
uint32_t const nbTiles = divUp(seqLen, tileSize);
auto gemm1Acc = Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor>::Zero().eval();
@ -61,6 +62,16 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
float const qkScale = qScale * kvScale / sqrtf(validElemsPerHead);
uint32_t const seqBeg = (seqLen < slidingWinSize ? 0 : seqLen - slidingWinSize);
uint32_t const idxTileBeg = seqBeg / tileSize;
uint32_t const nbSubSeq = (multiBlockNum > 0 && nbTiles >= 2) ? mha::min(nbTiles, multiBlockNum) : 1;
std::vector<Eigen::Vector<float, headGrpSize>> skipRowMaxs(nbSubSeq);
for (uint32_t i = 0; i < nbSubSeq; i++)
{
skipRowMaxs[i].fill(-INFINITY);
}
bool const disableSkipForShortSeq = (seqLen < skipSoftmaxThresholdScaleFactor);
float const skipSoftmaxThreshold = disableSkipForShortSeq ? 0.0f : skipSoftmaxThresholdScaleFactor / seqLen;
for (uint32_t idxTile = idxTileBeg; idxTile < nbTiles; idxTile++)
{
Eigen::Matrix<float, headGrpSize, tileSize, Eigen::RowMajor> gemm0Acc;
@ -88,7 +99,22 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
}
}
Eigen::Vector<float, headGrpSize> const tileRowMax = gemm0Acc.rowwise().maxCoeff().cwiseMax(rowMax).eval();
Eigen::Vector<float, headGrpSize> const localRowMax = gemm0Acc.rowwise().maxCoeff().eval();
Eigen::Vector<float, headGrpSize> const tileRowMax = localRowMax.cwiseMax(rowMax).eval();
auto const prevSkipRowMax = skipRowMaxs[idxTile % nbSubSeq];
skipRowMaxs[idxTile % nbSubSeq] = localRowMax.cwiseMax(skipRowMaxs[idxTile % nbSubSeq]).eval();
if (!disableSkipForShortSeq && skipSoftmaxThreshold > 0)
{
*totalBlockCount += 1;
auto const skipSoftmaxMask = ((localRowMax - prevSkipRowMax).array() < std::log(skipSoftmaxThreshold));
bool const skipBlock = skipSoftmaxMask.all() && ((idxTile - idxTileBeg) >= nbSubSeq);
if (skipBlock)
{
*skippedBlockCount += 1;
continue;
}
}
Eigen::Matrix<float, headGrpSize, tileSize, Eigen::RowMajor> tileX
= (gemm0Acc.colwise() - tileRowMax).array().exp().eval();
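In the reference implementation above, the skip bookkeeping is kept per sub-sequence to mirror the multi-block kernel: tile i compares against skipRowMaxs[i % nbSubSeq], and the first tile of each sub-sequence can never be skipped since (idxTile - idxTileBeg) < nbSubSeq there and its slot still holds -INFINITY. A small illustration of which slot each tile uses (the counts are arbitrary):

#include <cstdio>

int main()
{
    unsigned const nbTiles = 8, idxTileBeg = 0, nbSubSeq = 3;
    for (unsigned i = idxTileBeg; i < nbTiles; ++i)
    {
        bool const canSkip = (i - idxTileBeg) >= nbSubSeq;
        std::printf("tile %u -> skipRowMaxs[%u], eligible for skip: %s\n",
            i, i % nbSubSeq, canSkip ? "yes" : "no");
    }
}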
@ -138,7 +164,8 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
template Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> \
refFlashAttention<prec, tileSize, isPaged, useBeamSearch>(IOHead const* q, \
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, \
float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks)
float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, \
float skipSoftmaxThreshold, uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)
INSTANTIATE_refFlashAttention(CacheElem, 64, false, false);
INSTANTIATE_refFlashAttention(CacheElem, 64, false, true);

View File

@ -88,7 +88,8 @@ struct CacheSeq<true, true>
template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks);
float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum);
template <typename MathElem, bool isPaged, bool useBeamSearch>
#if SPEC_DEC

View File

@ -150,7 +150,8 @@ template <uint32_t nbKHeads>
#endif
#endif
void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false,
bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30)
bool saveData = false, bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30,
float skipSoftmaxThresholdScaleFactor = 0.0f)
{
#if IS_MLA
if (nbKHeads != 1)
@ -224,6 +225,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
seqLen = (16U << 20) / gmemCacheHeadBytes; // 32MB per K+V head.
}
ctxLen = std::min(ctxLen, seqLen);
uint32_t skippedBlockCount = 0;
uint32_t totalBlockCount = 0;
if (skipSoftmaxThresholdScaleFactor > 0)
{
assert(useQGMMA);
}
float const kScale = cacheElemSize == 2 ? 1.f : 1 / 4.f;
float const vScale = kScale;
float const qScale = 1.f;
@ -329,6 +336,17 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
auto const rcpOutScale = ManagedMemBuf<float>(1);
auto const seqLenList = ManagedMemBuf<uint32_t[beamWidth]>(batchSize);
auto const ctxLenList = ManagedMemBuf<uint32_t[beamWidth]>(batchSize);
#if SKIP_SOFTMAX_ATTN
#ifdef SKIP_SOFTMAX_ATTN_BLOCK_STATS
auto const kernelSkippedBlockCount = ManagedMemBuf<uint32_t>(1);
auto const kernelTotalBlockCount = ManagedMemBuf<uint32_t>(1);
kernelSkippedBlockCount[0] = 0;
kernelTotalBlockCount[0] = 0;
#endif
#else
EXPECT_EQ(skipSoftmaxThresholdScaleFactor, 0.0f)
<< "Got non-zero skipSoftmaxThresholdScaleFactor while SKIP_SOFTMAX_ATTN is not enabled.";
#endif
#if USE_PAGED_KV_CACHE
auto const pageListBuf = ManagedMemBuf<std::byte>(pageListBytes);
#if PAGED_KV_CACHE_LAYOUT == 1
@ -726,6 +744,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
maxSeqLen, &seqLenList[0][0], batchSize, kvCacheScale.get(), semaphores.get(), scratch, stream);
};
#else
auto multiBlockNum = [&]()
{
auto const calcFunc = useQGMMA ? &computeNbSubSeqPerSeqHopperF8MHA : &computeNbSubSeqPerSeqMHA;
return calcFunc(prop, batchSize, nbKHeads, maxSeqLen);
}();
auto runKernel = [&]()
{
auto const launchFunc = useQGMMA ? &launchHopperF8MHA : &launchMHA;
@ -776,6 +799,12 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
batchSize, kvCacheScale.get(),
#if SPEC_DEC
specDecParams,
#endif
#if SKIP_SOFTMAX_ATTN
skipSoftmaxThresholdScaleFactor,
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
kernelSkippedBlockCount.get(), kernelTotalBlockCount.get(),
#endif
#endif
semaphores.get(), scratch, stream);
checkCuda(cudaGetLastError());
@ -813,6 +842,10 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
checkCuda(cudaEventRecord(toc, stream));
prefetchToDevice(cudaCpuDeviceId);
checkCuda(cudaStreamSynchronize(stream));
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
kernelSkippedBlockCount[0] /= nbIters;
kernelTotalBlockCount[0] /= nbIters;
#endif
if (testPerf)
{
float ms;
@ -849,6 +882,15 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
= totalNbCacheLoadBytes + inputBytes + outputBytes; // we ignore page indices and beam search indices.
float const dramSolTime = totalTraffic / bandwidth * 1E3f;
float const dramSolRatio = dramSolTime / ms;
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
size_t const totalNbCacheLoadWithSkip = gmemCacheHeadBytes
* (nbKHeads + nbVHeads * (1 - 1.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]))
* nbLoadedCacheTokens;
float const totalTrafficWithSkip
= totalNbCacheLoadWithSkip + inputBytes + outputBytes; // we ignore page indices and beam search indices.
float const dramSolTimeWithSkip = totalTrafficWithSkip / bandwidth * 1E3f;
float const dramSolRatioWithSkip = dramSolTimeWithSkip / ms;
#endif
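The adjusted speed-of-light model above scales only the V-cache traffic by (1 - skipped/total), since a skipped tile still loads its K tile but elides the V load; K, input and output bytes are unchanged. A numeric sketch of the cache-traffic part with made-up sizes and counters (input/output bytes are ignored here):

#include <cstdio>

int main()
{
    double const gmemCacheHeadBytes = 576;    // hypothetical bytes per cache head
    double const nbKHeads = 8, nbVHeads = 8;
    double const nbLoadedCacheTokens = 1 << 20;
    double const skipped = 300, total = 1000; // block counters reported by the kernel
    double const bytesNoSkip = gmemCacheHeadBytes * (nbKHeads + nbVHeads) * nbLoadedCacheTokens;
    double const bytesWithSkip = gmemCacheHeadBytes
        * (nbKHeads + nbVHeads * (1.0 - skipped / total)) * nbLoadedCacheTokens;
    std::printf("cache traffic ratio with skip: %.3f\n", bytesWithSkip / bytesNoSkip); // 0.850
}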
if (verbose)
{
printf("done\n");
@ -863,7 +905,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
}
float const tops = headGrpSize * qSeqLen * float(seqLen) * (validElemsPerKHead + validElemsPerVHead) * 2
* nbKHeads * batchSize / (ms * 1E-3F) * 1E-12F;
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
printf("dramSolRatioWithSkip: %f%% (%f ms, TOPS = %f)\n", dramSolRatioWithSkip * 100, ms, tops);
#else
printf("dramSolRatio: %f%% (%f ms, TOPS = %f)\n", dramSolRatio * 100, ms, tops);
#endif
}
if (refCheck)
{
@ -1084,8 +1132,8 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
if (useQGMMA)
{
refOutput = refFlashAttention<CacheElem, 64>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize,
refAttentionSinks);
vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks,
skipSoftmaxThresholdScaleFactor, &skippedBlockCount, &totalBlockCount, multiBlockNum);
// refOutput = refAttention<CacheElem>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
// vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize);
}
@ -1132,6 +1180,14 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
#endif
}
}
#if SKIP_SOFTMAX_ATTN
printf("host skippedBlockCount: %d/%d (%.2f%%)\n", skippedBlockCount, totalBlockCount,
totalBlockCount == 0 ? 0.0f : 100.0f * skippedBlockCount / totalBlockCount);
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
printf("kernel skippedBlockCount: %d/%d (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
#endif
#endif
if (saveData)
{
fout_refOutput.close();
@ -1253,6 +1309,14 @@ TEST(RefCheck, llama_V2_70b)
#if SLIDING_WINDOW
runTest<2>(2, 4096, false, true, false, false, false, ~0, 256);
runTest<2>(2, 400, false, true, false, false, false, ~0U, 256);
#endif
#if SKIP_SOFTMAX_ATTN
runTest<1>(32, 2048, false, true, false, false, false, ~0U, 1U << 30, 0.f);
runTest<4>(32, 1538, false, true, false, false, false, ~0U, 1U << 30, 1280.f);
runTest<2>(32, 4096, false, true, false, false, false, ~0U, 1U << 30, 125.f);
runTest<4>(32, 300, false, true, false, false, false, ~0U, 1U << 30, 80.f);
runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 501.0f);
runTest<4>(32, 500, false, true, false, false, false, ~0U, 1U << 30, 500.f);
#endif
runTest<8>(120, 367, false, true);
runTest<8>(1792, 2048, false, true);

View File

@ -1556,7 +1556,7 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm
}
}
std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, bool pinBlocks)
{
SizeType32 numBlocksStoredForReuse = 0;
@ -1569,7 +1569,7 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s
auto numBlocks = blockKeys.size();
std::vector<BlockPtr> storedBlocks;
std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
std::vector<KVCacheBlock::IdType> pinnedBlockIds;
for (std::size_t blockCnt = 0; blockCnt < numBlocks; ++blockCnt)
{
auto const bid = blockIds[blockCnt];
@ -1620,14 +1620,14 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s
if (pinBlocks)
{
searchRoot->incRefCount();
pinnedBlockIds.push_back(searchRoot->getBlockId());
}
lastStoredId = searchRoot->getBlockId();
}
if (mEventManager)
{
mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);
}
return {numBlocksStoredForReuse, lastStoredId};
return {numBlocksStoredForReuse, pinnedBlockIds};
}
void BlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx)
@ -1715,15 +1715,15 @@ std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::c
return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};
}
std::optional<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
std::vector<KVCacheBlock::IdType> pinnedBlockIds;
for (auto& [_, manager] : mWindowBlockManagers)
{
lastStoredId = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
pinnedBlockIds = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
}
return lastStoredId;
return pinnedBlockIds;
}
std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
@ -1767,7 +1767,7 @@ void BlockManager::pinBlocks(GenerationRequest& sequence)
}
}
void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void BlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
// Use the first window size
if (mWindowBlockManagers.empty())
@ -1775,7 +1775,7 @@ void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
return;
}
auto& firstManager = mWindowBlockManagers.begin()->second;
firstManager.unpinBlocksById(blockId);
firstManager.unpinBlocksById(blockIds);
}
void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
@ -1788,21 +1788,26 @@ void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
}
}
void WindowBlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
if (blockId < 0 || static_cast<size_t>(blockId) >= mAllBlocksById.size())
if (blockIds.empty())
{
return;
}
auto block = mAllBlocksById[blockId];
while (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
for (auto const& blockId : blockIds)
{
block->decRefCount();
if (!block->hasRefs())
TLLM_CHECK_WITH_INFO(blockId >= 0 && static_cast<size_t>(blockId) < mAllBlocksById.size(),
"Block id %d is out of range", blockId);
auto block = mAllBlocksById[blockId];
if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
{
mEvictionPolicy->releaseBlock(block);
block->decRefCount();
if (!block->hasRefs())
{
mEvictionPolicy->releaseBlock(block);
}
}
block = std::move(block->getPrevBlock());
}
}
@ -1870,7 +1875,7 @@ void WindowBlockManager::storeNewBlock(GenerationRequest& sequence, OptionalRef<
(void) storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]);
}
std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
auto constexpr beamIdx = 0;
@ -1883,7 +1888,10 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
auto const usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;
auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, true);
auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
return storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks).second;
auto [numStored, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks);
return pinnedBlockIds;
}
std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
@ -1922,7 +1930,7 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
std::transform(allocatedBlocks.begin(), allocatedBlocks.end(), cacheBlockIds.begin(),
[](BlockPtr const& block) { return block->getBlockId(); });
auto [numBlocksStoredForReuse, lastStoredId] = storeBlocks(std::move(blockKeys), cacheBlockIds);
auto [numBlocksStoredForReuse, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds);
TLLM_LOG_DEBUG("%s::releaseBlocks Request %lu, %d blocks stored for reuse", mLogPrefix.c_str(),
sequence.getRequestId(), numBlocksStoredForReuse);
}
@ -2499,15 +2507,14 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
return lastStoredId;
}
std::optional<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
{
TLLM_LOG_TRACE("[%s]::%s start", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
auto& sequence = getSequence(requestId);
std::optional<KVCacheBlock::IdType> lastStoredId
= mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
auto pinnedBlockIds = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
TLLM_LOG_TRACE("[%s]::%s stop", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
return lastStoredId;
return pinnedBlockIds;
}
void KVCacheManager::schedulingRemoveSequence(RequestIdType requestId)
@ -2522,9 +2529,9 @@ void KVCacheManager::pinBlocks(RequestIdType requestId)
mBlockManager.pinBlocks(sequence);
}
void KVCacheManager::unpinBlocksById(KVCacheBlock::IdType blockId)
void KVCacheManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
{
mBlockManager.unpinBlocksById(blockId);
mBlockManager.unpinBlocksById(blockIds);
}
SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSlotOffset, RequestIdType requestId) const

View File

@ -298,6 +298,11 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
xqaParams.use_sparse_attention = useTllmGenSparseAttention();
// Skip softmax threshold.
xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode;
#ifdef SKIP_SOFTMAX_STAT
// Statistics of skip-softmax, pointers of device memory for output
xqaParams.skip_softmax_total_blocks = mSkipSoftmaxTotalBlocks;
xqaParams.skip_softmax_skipped_blocks = mSkipSoftmaxSkippedBlocks;
#endif
// Cross attention parameters.
xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;

View File

@ -2179,11 +2179,11 @@ void Executor::Impl::terminateContextFinishedRequests(InTransList& inTransmissio
auto req = item.request;
if (req->isDisaggContextCompleteState())
{
// If lastBlockId was tracked, unpin it. Otherwise, just terminate.
// If pinnedBlockIds were tracked, unpin them. Otherwise, just terminate.
auto kvMgr = mModel->getKVCacheManager();
if (kvMgr && item.lastBlockId.has_value())
if (kvMgr && !item.pinnedBlockIds.empty())
{
kvMgr->unpinBlocksById(item.lastBlockId.value());
kvMgr->unpinBlocksById(item.pinnedBlockIds);
}
else
{
@ -2234,14 +2234,14 @@ Executor::Impl::RequestList Executor::Impl::populateNewResponses(
// move the in transmission requests to another tracker
if (llmReq->isDisaggContextTransmissionState())
{
std::optional<SizeType32> lastBlockId{};
std::vector<SizeType32> pinnedBlockIds{};
auto kvMgr = mModel->getKVCacheManager();
if (kvMgr && kvMgr->isEnableBlockReuse() && !kvMgr->getBlockManager().isVariableWindow())
{
lastBlockId = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
pinnedBlockIds = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
mModel->terminateRequest(llmReq);
}
inTransmissionRequests.push_back(InTransmissionItem{*it, lastBlockId});
inTransmissionRequests.push_back(InTransmissionItem{*it, pinnedBlockIds});
}
finishedRequests.push_back(*it);
it = activeRequests.erase(it);

View File

@ -80,12 +80,12 @@ class Executor::Impl
using RequestList = std::list<LlmRequestPtr>;
// When block reuse is enabled for context worker for disaggregated serving,
// we need to store the last block id so that we can unpin the block when
// we need to store the pinned block ids so that we can unpin them when
// the request is finished.
struct InTransmissionItem
{
LlmRequestPtr request;
std::optional<SizeType32> lastBlockId;
std::vector<SizeType32> pinnedBlockIds;
};
using InTransList = std::list<InTransmissionItem>;

View File

@ -105,7 +105,8 @@ CubinObj CompileEngine::compile() const
// scratch in this case.
/*use_input_kv=*/applyRoPEInXqaKernel,
/*rope_style=*/ropeStyle,
/*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree};
/*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree,
/*use_skip_softmax_attn=*/mXqaParams.skip_softmax_threshold_scale_factor != 0};
if (context.kernel_type == TLLM_XQA_JIT_MLA)
{
auto const& c = context;

View File

@ -232,6 +232,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
jit::CubinObj const* const cubinObj = mResource->getCubinObjRegistry()->getCubin(key);
TLLM_CHECK(cubinObj != nullptr && cubinObj->isInitialized());
bool const isSpecDec = xqaParams.multi_query_tokens;
bool const isSkipSoftmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
bool const isHMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kAMPERE_WARP_SPECIALIZED);
bool const isGMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kHOPPER_WARP_SPECIALIZED);
bool const isMLAKernel = (cubinObj->getKernelType() == XQAKernelType::kSM120_MLA);
@ -378,7 +379,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
.mask = reinterpret_cast<SpecDecParams::MaskType const*>(xqaParams.spec_decoding_packed_mask)};
};
constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 16;
constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 19;
uint32_t idxNextParam = 0;
void* kernelParams[kMAX_NB_KERNEL_PARAMS];
auto appendParam = [&](auto* p) mutable
@ -514,6 +515,16 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
appendParam(&specDecParams);
specDecBlocks = divUp(specDecParams.qSeqLen, 64 / num_q_heads_over_kv);
}
if (isSkipSoftmax)
{
TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel for now.");
TLLM_CHECK_WITH_INFO(!isSpecDec, "skip softmax is not supported with spec dec for now.");
appendParam(&xqaParams.skip_softmax_threshold_scale_factor);
#ifdef SKIP_SOFTMAX_STAT
appendParam(&xqaParams.skip_softmax_total_blocks);
appendParam(&xqaParams.skip_softmax_skipped_blocks);
#endif
}
appendParam(&launchParams.semaphores);
appendParam(&launchParams.scratch);
kernelParams[idxNextParam] = nullptr; // one extra nullptr at end as guard.

View File

@ -96,10 +96,16 @@ bool supportConfigQGMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlu
{
return false;
}
if (xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, xqaParams.kv_cache_data_type))
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (!is_skip_softmax && xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
{
// Only use hopper kernel with fp16/bf16 kv cache data type when skip softmax is enabled
return false;
}
if (xqaParams.beam_width != 1)
{
return false;
@ -168,6 +174,11 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (is_skip_softmax)
{
return false;
}
return true;
}
@ -201,6 +212,11 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi
{
return false;
}
bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
if (is_skip_softmax)
{
return false;
}
return true;
}

View File

@ -66,6 +66,7 @@ extern "C"
bool is_spec_dec_tree
= true; // useful only when multi_query_tokens, should be true unless using linear tree in spec-dec.
bool use_skip_softmax_attn;
} tllmXqaJitContext;
// tllmXqaJitProgram is an opaque handle for a program.

View File

@ -215,6 +215,10 @@ tllmXqaJitStatus getMacroFlags(tllmXqaJitContext const* context, std::vector<std
macros["USE_INPUT_KV"] = context->use_input_kv ? "1" : "0";
macros["ROPE_STYLE"] = std::to_string(int(context->rope_style));
macros["IS_SPEC_DEC_TREE"] = context->is_spec_dec_tree ? "1" : "0";
macros["SKIP_SOFTMAX_ATTN"] = context->use_skip_softmax_attn ? "1" : "0";
#ifdef SKIP_SOFTMAX_STAT
macros["SKIP_SOFTMAX_ATTN_BLOCK_STATS"] = context->use_skip_softmax_attn ? "1" : "0";
#endif
// Without these macros, NVRTC uses precompiled headers for cuda_fp16.h etc.
// Linking might fail due to ABI incompatibility.

View File

@ -493,6 +493,10 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
{
SUPPORT_RETURN_FALSE("streaming-llm");
}
if (xqaParams.skip_softmax_threshold_scale_factor != 0)
{
SUPPORT_RETURN_FALSE("skip_softmax_threshold_scale_factor");
}
// OPTIMIZE: For the standard generation-phase MHA, there are still extra limitations.
// NOTE: Medusa mode = Multi_query_tokens > 1.

View File

@ -64,6 +64,21 @@ CUtensorMapSwizzle getSwizzleMode(uint32_t partBytes)
}
};
CUtensorMapDataType_enum getDataTypeFromXqaParams(XQAParams const& xqaParams)
{
if (xqaParams.kv_cache_data_type == DATA_TYPE_BF16)
{
return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
}
else if (xqaParams.kv_cache_data_type == DATA_TYPE_FP16)
{
return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
}
TLLM_CHECK(xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 || xqaParams.kv_cache_data_type == DATA_TYPE_E5M2
|| xqaParams.kv_cache_data_type == DATA_TYPE_INT8);
return CU_TENSOR_MAP_DATA_TYPE_UINT8;
}
CUtensorMap makeTensorMapForQ(std::shared_ptr<CUDADriverWrapper> const& driver, void const* addr,
CUtensorMapDataType_enum dataType, uint32_t headElems, uint32_t totalNbHeads, uint32_t partElems, uint32_t boxHeads)
{
@ -131,24 +146,26 @@ CUtensorMap makeTensorMapForHopperXqaKVCache(
if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
{
uint32_t const headElems = xqaParams.head_size;
uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const elemBytes = getElemBytes(dataType);
TLLM_CHECK(headElems <= 256);
uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
}
else
{
static_assert(std::is_same_v<KVCacheBuffer, KVLinearBuffer>);
uint32_t const headElems = xqaParams.head_size;
uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const elemBytes = getElemBytes(dataType);
TLLM_CHECK(headElems <= 256);
uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width,
xqaParams.batch_size, partElems);
return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width, xqaParams.batch_size,
partElems);
}
}
@ -161,11 +178,12 @@ template <typename KVCacheBuffer>
CUtensorMap makeTensorMapForXqaMlaKVCache(std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver,
XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, bool forK)
{
CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
uint32_t const partElems = (forK ? 64 : 128);
if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
{
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
}
else
{
@ -183,7 +201,7 @@ CUtensorMap makeTensorMapForXqaMlaQ(
std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver, XQAParams const& xqaParams, void const* q)
{
uint32_t const partElems = 64;
return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size,
return makeTensorMapForQ(driver, q, getDataTypeFromXqaParams(xqaParams), xqaParams.head_size,
xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads);
}
} // namespace kernels

View File

@ -119,7 +119,12 @@ struct XQAParams
bool use_sparse_attention = false;
// Skip softmax threshold.
float skip_softmax_threshold_scale_factor = 0.0f;
float skip_softmax_threshold_scale_factor = 0;
#ifdef SKIP_SOFTMAX_STAT
uint32_t* skip_softmax_total_blocks = nullptr;
uint32_t* skip_softmax_skipped_blocks = nullptr;
#endif
cudaStream_t stream = 0;
// layer index
@ -199,6 +204,10 @@ struct XQAParams
<< "sparse_params: " << sparse_params.toString() << std::endl
<< "use_sparse_attention :" << (use_sparse_attention ? "true" : "false") << std ::endl
<< "skip_softmax_threshold_scale_factor :" << skip_softmax_threshold_scale_factor << std ::endl
#ifdef SKIP_SOFTMAX_STAT
<< "skip_softmax_total_blocks :" << skip_softmax_total_blocks << std ::endl
<< "skip_softmax_skipped_blocks :" << skip_softmax_skipped_blocks << std ::endl
#endif
<< "stream :" << stream;
return ss.str();

View File

@ -161,6 +161,7 @@ void initBindings(nb::module_& m)
.def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
.def_prop_ro("is_finished", &GenLlmReq::isFinished)
.def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
.def_prop_ro("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
.def_prop_rw(
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
.def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

View File

@ -123,7 +123,7 @@ public:
NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest, pinOnRelease);
}
std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
{
NB_OVERRIDE_PURE(storeBlocksForReuse, requestId, llmRequest, pinBlocks);

View File

@ -165,6 +165,7 @@ void initBindings(pybind11::module_& m)
.def("set_finished_reason", &GenLlmReq::setFinishedReason, py::arg("finish_reason"), py::arg("beam"))
.def_property_readonly("is_finished", &GenLlmReq::isFinished)
.def_property_readonly("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
.def_property_readonly("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
.def_property(
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
.def_property_readonly("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

View File

@ -111,10 +111,10 @@ public:
requestId, llmRequest, pinOnRelease);
}
std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
{
PYBIND11_OVERLOAD_PURE(std::optional<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
PYBIND11_OVERLOAD_PURE(std::vector<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
requestId, llmRequest, pinBlocks);
}

View File

@ -4066,11 +4066,13 @@ TEST_F(KVCacheManagerTest, PinAndUnpinBlocksById)
kvCacheManager.pinBlocks(requestId);
auto lastBlockIdOpt = kvCacheManager.getLastBlockId(requestId);
ASSERT_TRUE(lastBlockIdOpt.has_value());
auto const& allBlockIds = kvCacheManager.getCacheBlockIds(requestId, maxAttentionWindow)[0];
std::vector<SizeType32> pinnedBlockIds(allBlockIds.begin(), allBlockIds.end());
(void) kvCacheManager.removeSequence(requestId, llmRequest);
auto const freeAfterRemovePinned = kvCacheManager.getNumFreeBlocks();
EXPECT_LT(freeAfterRemovePinned, totalBlocks);
kvCacheManager.unpinBlocksById(lastBlockIdOpt.value());
kvCacheManager.unpinBlocksById(pinnedBlockIds);
auto const freeAfterUnpin = kvCacheManager.getNumFreeBlocks();
EXPECT_EQ(freeAfterUnpin, totalBlocks);
}

View File

@ -227,3 +227,7 @@ Run `bench.sh` to begin a serving benchmark. This will take a long time if you r
```shell
./bench.sh
```
## Known Issues
Qwen3-Next-80B-A3B exhibits relatively low accuracy on the SciCode-AA-v2 benchmark.

View File

@ -38,13 +38,14 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
| `DeepseekV3ForCausalLM` | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes |
| `DeepseekV32ForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3MoeForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes |
| `Qwen3NextForCausalLM` | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| `Qwen3NextForCausalLM` [^3] | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| `Llama4ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes |
| `GptOssForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes [^3] | Yes | Yes | Yes | N/A | Yes | Yes |
| `GptOssForCausalLM` | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes [^4] | Yes | Yes | Yes | N/A | Yes | Yes |
[^1]: Chunked Prefill for MLA can only be enabled on SM100/SM103.
[^2]: KV cache reuse for MLA can only be enabled on SM90/SM100/SM103 and in BF16/FP8 KV cache dtype.
[^3]: Overlap scheduler isn't supported when using EAGLE-3(Two Model Engine) for GPT-OSS.
[^3]: Qwen3-Next-80B-A3B exhibits relatively low accuracy on the SciCode-AA-v2 benchmark.
[^4]: Overlap scheduler isn't supported when using EAGLE-3 (Two Model Engine) for GPT-OSS.
# Multimodal Feature Support Matrix (PyTorch Backend)

View File

@ -0,0 +1 @@
attn_backend: triton

View File

@ -65,7 +65,7 @@ models:
- name: bigcode/starcoder2-7b
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: bigcode/starcoder2-15b-instruct-v0.1
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'attn_backend_triton.yaml']
- name: deepseek-ai/DeepSeek-Prover-V1.5-SFT
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: deepseek-ai/DeepSeek-Prover-V2-7B
@ -118,8 +118,6 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-27b-it
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-2b-it
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: deepseek-ai/DeepSeek-V2.5
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
# DISABLED: Network timeout downloading from Hugging Face
@ -145,8 +143,6 @@ models:
# DISABLED: Graph transformation error in auto-deploy
# - name: neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8
# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: TheBloke/falcon-40b-instruct-GPTQ
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: google/gemma-2-27b-it
@ -159,7 +155,7 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B-Preview
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: Qwen/Qwen3-Coder-32B-Instruct
- name: Qwen/Qwen3-Coder-30B-A3B-Instruct
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/Qwen3-235B-A22B-Instruct-2507
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
@ -222,3 +218,5 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_scout.yaml']
- name: meta-llama/Llama-4-Maverick-17B-128E-Instruct
yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_maverick_lite.yaml']
- name: nvidia/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-010726
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml','super_v3.yaml']

View File

@ -1,9 +1,6 @@
# EXAONE
This document shows how to build and run a [EXAONE](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) model in TensorRT-LLM.
The TensorRT LLM EXAONE implementation is based on the LLaMA model. The implementation can be found in [llama/model.py](../../../../tensorrt_llm/models/llama/model.py).
See the LLaMA example [`examples/models/core/llama`](../llama) for details.
This document shows how to build and run [EXAONE](https://huggingface.co/LGAI-EXAONE) models in TensorRT-LLM.
- [EXAONE](#exaone)
- [Support Matrix](#support-matrix)
@ -11,31 +8,51 @@ See the LLaMA example [`examples/models/core/llama`](../llama) for details.
- [EXAONE-3.0](#exaone-30)
- [EXAONE-Deep](#exaone-deep)
- [EXAONE-4.0](#exaone-40)
- [Usage](#usage)
- [PyTorch flow](#pytorch-flow)
-[PyTorch flow Quantization](#pytorch-flow-quantization)
- [TRT Flow](#trt-flow)
- [K-EXAONE](#k-exaone)
- [PyTorch flow](#pytorch-flow)
- [Running EXAONE-4.0](#running-exaone-40)
- [Running K-EXAONE](#running-k-exaone)
- [MoE Backend Options](#moe-backend-options)
- [PyTorch flow Quantization](#pytorch-flow-quantization)
- [FP8 Quantization](#fp8-quantization)
- [NVFP4 Quantization](#nvfp4-quantization)
- [Running the TensorRT LLM Server](#running-the-tensorrt-llm-server)
- [Running Aggregated TensorRT LLM Server](#running-aggregated-tensorrt-llm-server)
- [Creating the Extra Options Configuration](#creating-the-extra-options-configuration)
- [Launch trtllm-serve OpenAI-compatible API server](#launch-trtllm-serve-openai-compatible-api-server)
- [Running Disaggregated TensorRT LLM Server](#running-disaggregated-tensorrt-llm-server)
- [Step 1: Set Environment Variables](#step-1-set-environment-variables)
- [Step 2: Create Configuration Files](#step-2-create-configuration-files)
- [Step 3: Launch the Disaggregated Server](#step-3-launch-the-disaggregated-server)
- [TRT flow](#trt-flow)
- [Convert checkpoint and build TensorRT engine(s)](#convert-checkpoint-and-build-tensorrt-engines)
- [FP8 Post-Training Quantization](#fp8-post-training-quantization)
- [SmoothQuant](#smoothquant)
- [Groupwise quantization (AWQ)](#groupwise-quantization-awq)
- [W4A16 AWQ with FP8 GEMM (W4A8 AWQ)](#w4a16-awq-with-fp8-gemm-w4a8-awq)
- [W4A16 AWQ with FP8 GEMM (W4A8 AWQ)](#w4a16-awq-with-fp8-gemm-w4a8-awq)
- [Run Engine](#run-engine)
- [Troubleshooting](#troubleshooting)
- [Troubleshooting for EXAONE-4.0](#troubleshooting-for-exaone-40)
- [Troubleshooting for K-EXAONE](#troubleshooting-for-k-exaone)
## Support Matrix
* FP16
* BF16
* Tensor Parallel
* Tensor Parallel (TP)
* Expert Parallel (EP) (K-EXAONE only)
* Attention Data Parallel (ADP) (K-EXAONE only)
* Disaggregated Serving
* FP8
* INT8 & INT4 Weight-Only
* INT8 SmoothQuant
* INT4 AWQ & W4A8 AWQ
* NVFP4 (K-EXAONE only)
## Supported Models
**Note:**
- **EXAONE-3.0** and **EXAONE-Deep** are supported using the [TRT Flow](#trt-flow).
- **EXAONE-4.0** is supported using the [PyTorch flow](#pytorch-flow).
**Note:**
- **EXAONE-3.0** & **EXAONE-Deep** are supported using the [TRT Flow](#trt-flow).
- **EXAONE-4.0** & **K-EXAONE** are supported using the [PyTorch flow](#pytorch-flow).
Please refer to the corresponding sections below for usage instructions and examples for each model.
@ -59,23 +76,33 @@ git clone https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B $HF_MODEL_DIR
### EXAONE-4.0
Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `EXAONE-4.0-32B` model for the example. From EXAONE-4.0 model, we support only on PyTorch flow.
Download the HuggingFace checkpoints of the EXAONE-4.0 model. Here, we use the `EXAONE-4.0-32B` model as an example. EXAONE-4.0 is supported only via the PyTorch flow.
```bash
export HF_MODEL_DIR=hf_models/exaone4
git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B $HF_MODEL_DIR
```
### Pytorch flow
### K-EXAONE
K-EXAONE is a Mixture of Experts (MoE) model based on the EXAONE architecture. It features a hybrid architecture with both dense and MoE layers as well as sliding window attention, and it supports FP8 and NVFP4 quantization for efficient inference.
Download the HuggingFace checkpoints of the K-EXAONE model:
```bash
export HF_MODEL_DIR=hf_models/kexaone
git clone https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B $HF_MODEL_DIR
```
## PyTorch flow
### Running EXAONE-4.0
To quickly run EXAONE-4.0 models, you can use [examples/llm-api/quickstart_advanced.py](../../../llm-api/quickstart_advanced.py):
```bash
python ../../../llm-api/quickstart_advanced.py --model_dir hf_models/$MODEL_NAME --disable_kv_cache_reuse
python ../../../llm-api/quickstart_advanced.py --model_dir $HF_MODEL_DIR
```
SWA currently does not support kv_cache_reuse. Please make sure to disable KV cache reuse when running with SWA.
The output will be like:
```bash
[0] Prompt: 'Hello, my name is', Generated text: " [Your Name], and I'm a [Your Profession]. I'm here to learn and share with you.\n\nBest regards,\n[Your Name]\n\nThis letter is concise, professional, and clearly states who you are and what you're here for. It's a good starting point"
@ -83,47 +110,239 @@ The output will be like:
[2] Prompt: 'The future of AI is', Generated text: ' not just about technology but also about how we choose to use it. We must ensure that AI is developed and deployed in a way that benefits all of humanity, not just a select few. This means prioritizing ethical considerations, transparency, and accountability in AI development. It also means involving diverse stakeholders in the conversation about AI'
```
#### PyTorch flow Quantization
### Running K-EXAONE
For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
You can either do pre-quantized models in HF model hub, or can generate quantized model by yourself and then run models with below command:
K-EXAONE is a Mixture of Experts model that benefits from multiple parallelism strategies. You can run it with tensor parallelism (TP), expert parallelism (EP), and attention data parallelism (ADP):
```bash
git clone https://github.com/NVIDIA/Model-Optimizer.git
python ../../../llm-api/quickstart_advanced.py \
--model_dir $HF_MODEL_DIR \
--tp_size 8 \
--moe_ep_size 8 \
--enable_attention_dp \
--trust_remote_code
```
The output will be like:
```bash
[0] Prompt: 'Hello, my name is', Generated text: ' John Smith, and I am a 28-year-old software developer. I live in the city of San Francisco, California. I work remotely for a tech startup based in Austin, Texas.\n\nI enjoy hiking, reading, and playing the piano. In my free time, I often explore new neighborhoods in San Francisco, trying out new restaurants and cafes.\n\n'
[1] Prompt: 'The capital of France is', Generated text: ' Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris, the capital of France is Paris'
[2] Prompt: 'The future of AI is', Generated text: ' bright.\n</think>\n\nThe future of AI holds immense promise across numerous domains. In healthcare, AI is revolutionizing diagnostics, drug discovery, and personalized treatment plans. In education, AI is enabling adaptive learning platforms that cater to individual learning styles and paces. In environmental science, AI is playing a pivotal role in addressing climate change by optimizing'
```
#### MoE Backend Options
K-EXAONE supports the following MoE backends:
| Backend | Description |
|---------|-------------|
| `CUTLASS` | Default backend, optimized for general use cases |
| `TRTLLM` | TensorRT-LLM backend using TRT-LLM Gen kernels, optimized for low-latency inference |
| `WIDEEP` | Wide expert parallelism backend for cases where EP size exceeds the number of experts |
You can specify the MoE backend using the `--moe_backend` argument:
```bash
python ../../../llm-api/quickstart_advanced.py \
--model_dir $HF_MODEL_DIR \
--tp_size 8 \
--moe_ep_size 8 \
--enable_attention_dp \
--moe_backend CUTLASS \
--trust_remote_code
```
### PyTorch flow Quantization
For PyTorch flow, TRT-LLM supports quantized formats generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). You can either use pre-quantized models from the HuggingFace model hub, or generate quantized models yourself using the instructions below.
First, clone the [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) repository:
```bash
git clone https://github.com/NVIDIA/Model-Optimizer
cd Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf
```
For more information, please refer to official [docs](https://github.com/NVIDIA/Model-Optimizer) or [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer).
For more information, please refer to the official [Model Optimizer documentation](https://github.com/NVIDIA/Model-Optimizer).
Troubleshooting
#### FP8 Quantization
FP8 quantization provides a good balance between model accuracy and inference performance. To quantize a model to FP8 format:
The following error may occur during quantization:
```bash
torch._dynamo.exc.Unsupported: Graph break under GenericContextWrappingVariable
Explanation: Attempted to graph break in an active context manager(s) that doesn't support graph breaking.
Hint: Move the offending context manager(s) to outside the compiled region.
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
python3 hf_ptq.py --model $HF_MODEL_DIR --quant fp8 --export_fmt hf
```
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
#### NVFP4 Quantization
Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:
```json
# generation_config.json
{
// Change "hybrid" to "dynamic" to run PTQ.
// Revert this to "hybrid" after quantization is complete.
"cache_implementation": "hybrid",
...
}
NVFP4 (4-bit floating point) quantization reduces the GPU memory footprint and enables memory-efficient inference. To quantize a model to NVFP4 format:
```bash
python3 hf_ptq.py --model $HF_MODEL_DIR --quant nvfp4 --export_fmt hf
```
For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings.
### TRT flow
## Running the TensorRT LLM Server
The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`.
This section describes how to deploy the K-EXAONE model using the TensorRT LLM server with an OpenAI-compatible API endpoint.
Make sure `HF_MODEL_DIR` points to your EXAONE checkpoint directory.
The examples in this section are intended as a minimal, runnable demonstration and are not fully performance-optimized. For more features and performance tuning, please refer to the documents below.
- [Disaggregated Serving examples](../../../disaggregated/README.md)
- [Disaggregated Serving feature guide](../../../../docs/source/features/disagg-serving.md)
- [Recommended LLM API configuration settings](../../../configs/README.md) (see also `examples/configs/curated/`)
### Running Aggregated TensorRT LLM Server
The aggregated server runs all components (context and generation phases) on the same set of GPUs, which is suitable for single-node deployments.
#### Creating the Extra Options Configuration
Create a YAML configuration file to specify advanced options such as attention data parallelism, CUDA graph settings, and MoE backend configuration:
```bash
cat <<EOF > configs.yaml
enable_attention_dp: true
trust_remote_code: true
cuda_graph_config:
max_batch_size: 2048
enable_padding: true
moe_config:
backend: CUTLASS # The TRTLLM backend is recommended for the Blackwell architecture.
kv_cache_config:
enable_block_reuse: true # Please disable the block reuse feature when conducting performance benchmarking.
max_attention_window: [128, 128, 128, 131072] # This allows the KV cache manager to improve memory efficiency where possible.
free_gpu_memory_fraction: 0.9
dtype: "auto"
attention_dp_config:
enable_balance: true
batching_wait_iters: 50
timeout_iters: 1
num_postprocess_workers: 4 # Can mitigate the postprocessing overhead (e.g. detokenization)
EOF
```
#### Launch trtllm-serve OpenAI-compatible API server
Start the server using `trtllm-serve` with the PyTorch backend. This launches an OpenAI-compatible API server that can handle chat completions and text generation requests:
```bash
trtllm-serve \
$HF_MODEL_DIR \
--host localhost \
--port 8000 \
--backend pytorch \
--max_batch_size 2048 \
--max_num_tokens 8192 \
--tp_size 8 \
--ep_size 8 \
--pp_size 1 \
--config ./configs.yaml
```
Once the server is running, you can send requests to `http://localhost:8000/v1/completions` using the OpenAI API format.
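As a quick sanity check, you can issue a completion request with `curl`. This is only a minimal sketch that assumes the served model is addressed by the checkpoint path passed to `trtllm-serve`; adjust the `model` field, prompt, and token budget to your deployment:
```bash
# Minimal OpenAI-compatible completion request (illustrative; the "model" value is an assumption).
curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"$HF_MODEL_DIR"'",
        "prompt": "The capital of France is",
        "max_tokens": 32
    }'
```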
### Running Disaggregated TensorRT LLM Server
Disaggregated serving separates the context (prefill) and generation (decode) phases onto different GPU sets, enabling better resource utilization and improved throughput. This example demonstrates a single-node disaggregated deployment using 8 GPUs (4 for context, 4 for generation). For more details, see the [Disaggregated Serving documentation](../../../disaggregated/README.md).
#### Step 1: Set Environment Variables
Configure the parallelism and buffer settings:
```bash
# Buffer size for KV cache transfer between context and generation servers
export MAX_TOKENS_IN_BUFFER=8192
# Model parallelism configuration
export TP_SIZE=4
export MOE_EP_SIZE=4
export ENABLE_ATTENTION_DP=true
```
#### Step 2: Create Configuration Files
**Context server configuration (`ctx_extra-llm-api-config.yaml`):**
```bash
cat > ctx_extra-llm-api-config.yaml << EOF
backend: pytorch
trust_remote_code: true
disable_overlap_scheduler: true
enable_chunked_prefill: true
tensor_parallel_size: $TP_SIZE
moe_expert_parallel_size: $MOE_EP_SIZE
pipeline_parallel_size: 1
enable_attention_dp: $ENABLE_ATTENTION_DP
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: $MAX_TOKENS_IN_BUFFER
EOF
```
**Generation server configuration (`gen_extra-llm-api-config.yaml`):**
```bash
cat > gen_extra-llm-api-config.yaml << EOF
backend: pytorch
trust_remote_code: true
disable_overlap_scheduler: false
enable_chunked_prefill: true
tensor_parallel_size: $TP_SIZE
moe_expert_parallel_size: $MOE_EP_SIZE
pipeline_parallel_size: 1
enable_attention_dp: $ENABLE_ATTENTION_DP
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: $MAX_TOKENS_IN_BUFFER
EOF
```
**Disaggregated orchestrator configuration (`disagg_config.yaml`):**
```bash
cat > disagg_config.yaml << EOF
hostname: localhost
port: 8000
backend: pytorch
context_servers:
num_instances: 1
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
urls:
- "localhost:8002"
EOF
```
#### Step 3: Launch the Disaggregated Server
Start all components in the following order:
```bash
# 1. Start context server (GPUs 0-3)
CUDA_VISIBLE_DEVICES=0,1,2,3 trtllm-serve $HF_MODEL_DIR \
--host localhost --port 8001 --enable_chunked_prefill \
--extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx.log &
# 2. Start generation server (GPUs 4-7)
CUDA_VISIBLE_DEVICES=4,5,6,7 trtllm-serve $HF_MODEL_DIR \
--host localhost --port 8002 --enable_chunked_prefill \
--extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen.log &
# 3. Start disaggregated orchestrator
trtllm-serve disaggregated -c disagg_config.yaml -t 360 -r 1200 &> log_disagg.log &
```
Once all servers are running, you can send requests to `http://localhost:8000/v1/completions` using the OpenAI API format.
## TRT flow
The next section describes how to convert weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT LLM format. We will use LLaMA's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE models and then build the model with `trtllm-build`.
### Convert checkpoint and build TensorRT engine(s)
@ -141,7 +360,7 @@ trtllm-build \
--output_dir trt_engines/exaone/fp16/1-gpu \
--gemm_plugin auto
# Build the EXAONE model using a single GPU and and apply INT8 weight-only quantization.
# Build the EXAONE model using a single GPU and apply INT8 weight-only quantization.
python ../llama/convert_checkpoint.py \
--model_dir $HF_MODEL_DIR \
--output_dir trt_models/exaone/int8_wq/1-gpu \
@ -154,7 +373,7 @@ trtllm-build \
--output_dir trt_engines/exaone/int8_wq/1-gpu \
--gemm_plugin auto
# Build the EXAONE model using a single GPU and and apply INT4 weight-only quantization.
# Build the EXAONE model using a single GPU and apply INT4 weight-only quantization.
python ../llama/convert_checkpoint.py \
--model_dir $HF_MODEL_DIR \
--output_dir trt_models/exaone/int4_wq/1-gpu \
@ -183,18 +402,18 @@ trtllm-build \
### FP8 Post-Training Quantization
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
```bash
# Build the EXAONE model using a single GPU and and apply FP8 quantization.
# Build the EXAONE model using a single GPU and apply FP8 quantization.
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir trt_models/exaone/fp8/1-gpu \
--output_dir trt_models/exaone/fp8/1-gpu
trtllm-build \
--checkpoint_dir trt_models/exaone/fp8/1-gpu \
@ -204,12 +423,12 @@ trtllm-build \
### SmoothQuant
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
```bash
# Build the EXAONE model using a single GPU and and apply INT8 SmoothQuant.
# Build the EXAONE model using a single GPU and apply INT8 SmoothQuant.
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
@ -224,12 +443,12 @@ trtllm-build \
### Groupwise quantization (AWQ)
The examples below uses the NVIDIA Modelopt (AlgorithMic Model Optimization) toolkit for the model quantization process.
The examples below use the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process.
First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation))
```bash
# Build the EXAONE model using a single GPU and and apply INT4 AWQ.
# Build the EXAONE model using a single GPU and apply INT4 AWQ.
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
@ -248,7 +467,7 @@ For Hopper GPUs, TRT-LLM also supports employing FP8 GEMM for accelerating linea
Please make sure your system contains a Hopper GPU before trying the commands below.
```bash
# Build the EXAONE model using a single GPU and and apply W4A8 AWQ.
# Build the EXAONE model using a single GPU and apply W4A8 AWQ.
python ../../../quantization/quantize.py \
--model_dir $HF_MODEL_DIR \
--dtype float16 \
@ -287,4 +506,50 @@ python ../../../summarize.py \
--engine_dir trt_engines/exaone/fp16/1-gpu
```
For more examples see [`examples/models/core/llama/README.md`](../llama/README.md)
For more examples regarding EXAONE-3.0 & EXAONE-Deep's TRT flow, see [`examples/models/core/llama/README.md`](../llama/README.md)
## Troubleshooting
### Troubleshooting for EXAONE-4.0
The following error may occur during quantization:
```bash
torch._dynamo.exc.Unsupported: Graph break under GenericContextWrappingVariable
Explanation: Attempted to graph break in an active context manager(s) that doesn't support graph breaking.
Hint: Move the offending context manager(s) to outside the compiled region.
Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one.
```
This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache.
Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example:
```json
# generation_config.json
{
// Change "hybrid" to "dynamic" to run PTQ.
// Revert this to "hybrid" after quantization is complete.
"cache_implementation": "hybrid",
...
}
```
For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings.
### Troubleshooting for K-EXAONE
K-EXAONE is a Mixture of Experts (MoE) model that activates 8 experts per token. If too few tokens are provided during PTQ, some experts in some layers may never be activated and will not end up with properly calibrated weights.
To address this issue, provide enough data samples during calibration by increasing the `calib_size` and `calib_seq` parameters:
**FP8 Quantization:**
```bash
cd Model-Optimizer/examples/llm_ptq
python3 hf_ptq.py --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf --calib_size 8192 --calib_seq 1024
```
**NVFP4 Quantization:**
```bash
cd Model-Optimizer/examples/llm_ptq
python3 hf_ptq.py --model hf_models/$MODEL_NAME --quant nvfp4 --export_fmt hf --calib_size 8192 --calib_seq 1024
```

View File

@ -10,7 +10,7 @@ tiktoken
einops
# optional dependencies
gradio==4.44.1
gradio==5.4.0
mdtex2html
sse_starlette
aiohttp_sse_client

View File

@ -1155,7 +1155,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
export pytestCommand="$pytestCommand"
export coverageConfigFile="$coverageConfigFile"
export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0}
export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))}
export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=csv,noheader)-1)))}
${envExportStatements}
echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS"
@ -3249,10 +3249,12 @@ def launchTestJobs(pipeline, testFilter)
fullSet = parallelJobs.keySet()
x86SlurmTestConfigs = [
"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 2, 2],
"DGX_H100-2_GPUs-PyTorch-Others-2": ["dgx-h100-x2-oci", "l0_dgx_h100", 2, 2, 2],
"DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 2, 4],
"DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4-oci", "l0_dgx_h100", 2, 2, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],

View File

@ -0,0 +1,111 @@
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
import java.lang.InterruptedException
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202510291120-8621"
// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"
def createKubernetesPodConfig(image, arch = "amd64")
{
def archSuffix = arch == "arm64" ? "arm" : "amd"
def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
def podConfig = [
cloud: "kubernetes-cpu",
namespace: "sw-tensorrt",
yaml: """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
nvidia.com/node_type: builder
kubernetes.io/os: linux
containers:
- name: trt-llm
image: ${image}
command: ['cat']
volumeMounts:
- name: sw-tensorrt-pvc
mountPath: "/mnt/sw-tensorrt-pvc"
readOnly: false
tty: true
resources:
requests:
cpu: 2
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: 2
memory: 5Gi
ephemeral-storage: 25Gi
imagePullPolicy: Always
- name: jnlp
image: ${jnlpImage}
args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
resources:
requests:
cpu: '2'
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 5Gi
ephemeral-storage: 25Gi
qosClass: Guaranteed
volumes:
- name: sw-tensorrt-pvc
persistentVolumeClaim:
claimName: sw-tensorrt-pvc
""".stripIndent(),
]
return podConfig
}
pipeline {
agent {
kubernetes createKubernetesPodConfig(DOCKER_IMAGE)
}
options {
timestamps()
}
environment {
OPEN_SEARCH_DB_BASE_URL=credentials("open_search_db_base_url")
OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
}
parameters {
string(name: "BRANCH", defaultValue: "main", description: "Branch to checkout.")
string(name: "OPEN_SEARCH_PROJECT_NAME", defaultValue: "swdl-trtllm-infra-ci-prod-perf_sanity_info", description: "OpenSearch project name.")
string(name: "OPERATION", defaultValue: "SLACK BOT SENDS MESSAGE", description: "Operation to perform.")
string(name: "QUERY_JOB_NUMBER", defaultValue: "1", description: "Number of latest jobs to query.")
string(name: "SLACK_CHANNEL_ID", defaultValue: "C0A7D0LCA1F", description: "Slack channel IDs to send messages to.")
string(name: "SLACK_BOT_TOKEN", defaultValue: "", description: "Slack bot token for authentication.")
}
stages {
stage("Run Perf Sanity Script") {
steps {
container("trt-llm") {
script {
sh "pwd && ls -alh"
sh "env | sort"
trtllm_utils.checkoutSource(LLM_REPO, params.BRANCH, LLM_ROOT, false, false)
sh "pip install slack_sdk"
sh """
cd ${LLM_ROOT}/jenkins/scripts/perf && ls -alh && python3 perf_sanity_triage.py \
--project_name "${params.OPEN_SEARCH_PROJECT_NAME}" \
--operation "${params.OPERATION}" \
--channel_id "${params.SLACK_CHANNEL_ID}" \
--bot_token "${params.SLACK_BOT_TOKEN}" \
--query_job_number "${params.QUERY_JOB_NUMBER}"
"""
}
}
}
} // stage Run Perf Sanity Script
} // stages
} // pipeline

View File

@ -0,0 +1,251 @@
#!/usr/bin/env python3
import argparse
import json
import sys
import time
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
sys.path.insert(0, sys.path[0] + "/..")
from open_search_db import OpenSearchDB
QUERY_LOOKBACK_DAYS = 90
MAX_QUERY_SIZE = 3000
MAX_TEST_CASES_PER_MSG = 5
POST_SLACK_MSG_RETRY_TIMES = 5
def query_regression_data(project_name):
"""Query regression data from OpenSearch database."""
last_days = QUERY_LOOKBACK_DAYS
must_clauses = [
{"term": {"b_is_valid": True}},
{"term": {"b_is_post_merge": True}},
{"term": {"b_is_regression": True}},
{"term": {"b_is_baseline": False}},
{
"range": {
"ts_created": {
"gte": int(time.time() - 24 * 3600 * last_days)
// (24 * 3600)
* 24
* 3600
* 1000,
}
}
},
]
json_data = {
"query": {
"bool": {"must": must_clauses},
},
"size": MAX_QUERY_SIZE,
}
json_data = json.dumps(json_data)
data_list = []
try:
res = OpenSearchDB.queryFromOpenSearchDB(json_data, project_name)
if res is None:
print(f"Failed to query from {project_name}, returned no response")
return None
payload = res.json().get("hits", {}).get("hits", [])
if len(payload) == 0:
print(f"No regression data found in {project_name}, returned empty list")
return []
for hit in payload:
data_dict = hit.get("_source", {})
data_dict["_id"] = hit.get("_id", "")
if data_dict["_id"] == "":
print(f"Failed to query from {project_name}, returned data with no _id")
return None
data_list.append(data_dict)
print(f"Successfully queried from {project_name}, queried {len(data_list)} entries")
return data_list
except Exception as e:
print(f"Failed to query from {project_name}, returned error: {e}")
return None
def get_regression_data_by_job_id(data_list, query_job_number):
"""Returns a dict with job_id as key and list of regression data as value.
Only returns the latest query_job_number jobs.
"""
if data_list is None or len(data_list) == 0:
return {}
# Group data by job_id
job_data_dict = {}
for data in data_list:
job_id = data.get("s_job_id", "")
if job_id == "":
continue
if job_id not in job_data_dict:
job_data_dict[job_id] = []
job_data_dict[job_id].append(data)
# Sort job_ids by the latest ts_created in each group (descending)
def get_latest_timestamp(job_id):
timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]]
return max(timestamps) if timestamps else 0
sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True)
# Only keep the latest query_job_number jobs
latest_job_ids = sorted_job_ids[:query_job_number]
result = {}
for job_id in latest_job_ids:
result[job_id] = job_data_dict[job_id]
return result
def process_regression_message(regression_dict):
"""Process regression data into message chunks.
Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases.
"""
if not regression_dict:
return []
# Flatten all test cases into a list with (job_id, idx, data) tuples
all_test_cases = []
for job_id, data_list in regression_dict.items():
sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", ""))
for idx, data in enumerate(sorted_data_list, start=1):
all_test_cases.append((job_id, idx, data))
# Split into chunks of MAX_TEST_CASES_PER_MSG
chunks = []
for i in range(0, len(all_test_cases), MAX_TEST_CASES_PER_MSG):
chunks.append(all_test_cases[i : i + MAX_TEST_CASES_PER_MSG])
# Build messages for each chunk
messages = []
for chunk in chunks:
msg_parts = []
current_job_id = None
for job_id, idx, data in chunk:
# Add job header when switching to a new job_id
if job_id != current_job_id:
if msg_parts:
msg_parts.append("\n")
job_header = f"*LLM/main/L0_PostMerge/{job_id}:*\n"
msg_parts.append(job_header)
current_job_id = job_id
test_case_name = data.get("s_test_case_name", "N/A")
regression_info = data.get("s_regression_info", "N/A")
msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n")
for part in regression_info.split(","):
part = part.strip()
if part and "baseline_id" not in part:
msg_parts.append(f" {part}\n")
msg = "".join(msg_parts).strip()
messages.append(msg)
return messages
def send_regression_message(messages, channel_id, bot_token):
"""Send regression messages to Slack channel(s).
channel_id can be a single ID or multiple IDs separated by commas.
"""
if not messages:
print("No regression data to send")
return
if channel_id and bot_token:
channel_ids = [cid.strip() for cid in channel_id.split(",") if cid.strip()]
for cid in channel_ids:
for msg in messages:
send_message(msg, cid, bot_token)
else:
print("Slack channel_id or bot_token not provided, printing message:")
for i, msg in enumerate(messages, start=1):
print(f"--- Message {i} ---")
print(msg)
def send_message(msg, channel_id, bot_token):
"""Send message to Slack channel using slack_sdk."""
client = WebClient(token=bot_token)
attachments = [
{
"title": "Perf Sanity Regression Report",
"color": "#ff0000",
"text": msg,
}
]
for attempt in range(1, POST_SLACK_MSG_RETRY_TIMES + 1):
try:
result = client.chat_postMessage(
channel=channel_id,
attachments=attachments,
)
assert result["ok"] is True, json.dumps(result.data)
print(f"Message sent successfully to channel {channel_id}")
return
except SlackApiError as e:
print(
f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Error sending message to Slack: {e}"
)
except Exception as e:
print(f"Attempt {attempt}/{POST_SLACK_MSG_RETRY_TIMES}: Unexpected error: {e}")
if attempt < POST_SLACK_MSG_RETRY_TIMES:
time.sleep(1)
print(
f"Failed to send message to channel {channel_id} after {POST_SLACK_MSG_RETRY_TIMES} attempts"
)
def main():
parser = argparse.ArgumentParser(description="Perf Sanity Triage Script")
parser.add_argument("--project_name", type=str, required=True, help="OpenSearch project name")
parser.add_argument("--operation", type=str, required=True, help="Operation to perform")
parser.add_argument(
"--channel_id",
type=str,
default="",
help="Slack channel ID(s), comma-separated for multiple channels",
)
parser.add_argument("--bot_token", type=str, default="", help="Slack bot token")
parser.add_argument(
"--query_job_number", type=int, default=1, help="Number of latest jobs to query"
)
args = parser.parse_args()
print(f"Project Name: {args.project_name}")
print(f"Operation: {args.operation}")
print(f"Channel ID: {args.channel_id}")
print(f"Bot Token: {'***' if args.bot_token else 'Not provided'}")
print(f"Query Job Number: {args.query_job_number}")
if args.operation == "SLACK BOT SENDS MESSAGE":
data_list = query_regression_data(args.project_name)
if data_list is None:
print("Failed to query regression data")
return
regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
messages = process_regression_message(regression_dict)
send_regression_message(messages, args.channel_id, args.bot_token)
else:
print(f"Unknown operation: {args.operation}")
if __name__ == "__main__":
main()

View File

@ -40,6 +40,14 @@ TEST_LIST_PATH = (
REPO_ROOT / "tests" / "integration" / "test_lists" / "qa" / "llm_config_database.yml"
)
ITERATIONS = 10
# Mapping from HuggingFace model IDs to MODEL_PATH_DICT keys used by the test framework
# in tests/integration/defs/perf/test_perf_sanity.py
MODEL_NAME_MAPPING = {
"deepseek-ai/DeepSeek-R1-0528": "deepseek_r1_0528_fp8",
"nvidia/DeepSeek-R1-0528-FP4-v2": "deepseek_r1_0528_fp4_v2",
"openai/gpt-oss-120b": "gpt_oss_120b_fp4",
}
# GPU type to condition wildcards mapping for test list
# Note: cpu is used to distinguish between e.g. H200_SXM and GH200
@ -65,9 +73,13 @@ def generate_client_name(recipe: Recipe) -> str:
def recipe_to_server_config(recipe: Recipe, llm_api_config: dict) -> dict:
"""Convert a recipe + LLM API config to aggr_server format."""
model_name = MODEL_NAME_MAPPING.get(recipe.model)
if not model_name:
raise ValueError(f"Model not found in MODEL_NAME_MAPPING: {recipe.model}")
server_config = {
"name": generate_server_name(recipe),
"model_name": recipe.model,
"model_name": model_name,
"gpus": recipe.num_gpus,
# Enable scenario-only matching for baseline comparison
"match_mode": "scenario",
@ -157,7 +169,7 @@ def generate_condition_entry(
}
tests = [
f"perf/test_perf.py::test_perf[perf_sanity_upload-{config_name}-{name}]"
f"perf/test_perf_sanity.py::test_e2e[aggr_upload-{config_name}-{name}]"
for name in server_names
]
return {"condition": condition, "tests": tests}

View File

@ -150,53 +150,58 @@ testing = ["filelock"]
[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"},
{file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"},
{file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"},
{file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"},
{file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"},
{file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"},
{file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"},
{file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"},
{file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"},
{file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"},
{file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"},
{file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"},
{file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"},
{file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"},
{file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"},
{file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"},
{file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"},
{file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"},
{file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"},
{file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"},
{file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
{file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"},
{file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"},
{file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"},
{file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"},
{file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"},
{file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"},
{file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"},
{file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"},
{file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"},
{file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"},
{file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"},
{file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"},
{file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"},
{file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"},
{file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"},
{file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"},
{file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"},
{file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"},
{file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"},
{file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"},
{file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"},
{file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"},
{file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"},
{file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"},
]
[[package]]

View File

@ -1119,53 +1119,58 @@ test = ["pytest"]
[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"},
{file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"},
{file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"},
{file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"},
{file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"},
{file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"},
{file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"},
{file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"},
{file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"},
{file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"},
{file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"},
{file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"},
{file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"},
{file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"},
{file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"},
{file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"},
{file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"},
{file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"},
{file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"},
{file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"},
{file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
{file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"},
{file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"},
{file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"},
{file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"},
{file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"},
{file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"},
{file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"},
{file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"},
{file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"},
{file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"},
{file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"},
{file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"},
{file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"},
{file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"},
{file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"},
{file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"},
{file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"},
{file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"},
{file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"},
{file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"},
{file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"},
{file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"},
{file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"},
{file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"},
]
[[package]]

View File

@ -2396,80 +2396,80 @@ test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis
[[package]]
name = "scipy"
version = "1.16.3"
version = "1.17.0"
description = "Fundamental algorithms for scientific computing in Python"
optional = false
python-versions = ">=3.11"
files = [
{file = "scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97"},
{file = "scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511"},
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005"},
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb"},
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876"},
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2"},
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e"},
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733"},
{file = "scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78"},
{file = "scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184"},
{file = "scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6"},
{file = "scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07"},
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9"},
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686"},
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203"},
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1"},
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe"},
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70"},
{file = "scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc"},
{file = "scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2"},
{file = "scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c"},
{file = "scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d"},
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9"},
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4"},
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959"},
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88"},
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234"},
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d"},
{file = "scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304"},
{file = "scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119"},
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c"},
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e"},
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135"},
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6"},
{file = "scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc"},
{file = "scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a"},
{file = "scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6"},
{file = "scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657"},
{file = "scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26"},
{file = "scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc"},
{file = "scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22"},
{file = "scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc"},
{file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0"},
{file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800"},
{file = "scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d"},
{file = "scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa"},
{file = "scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8"},
{file = "scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353"},
{file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146"},
{file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d"},
{file = "scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7"},
{file = "scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562"},
{file = "scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb"},
{file = "scipy-1.17.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd"},
{file = "scipy-1.17.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558"},
{file = "scipy-1.17.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:272a9f16d6bb4667e8b50d25d71eddcc2158a214df1b566319298de0939d2ab7"},
{file = "scipy-1.17.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7204fddcbec2fe6598f1c5fdf027e9f259106d05202a959a9f1aecf036adc9f6"},
{file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fc02c37a5639ee67d8fb646ffded6d793c06c5622d36b35cfa8fe5ececb8f042"},
{file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dac97a27520d66c12a34fd90a4fe65f43766c18c0d6e1c0a80f114d2260080e4"},
{file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb7446a39b3ae0fe8f416a9a3fdc6fba3f11c634f680f16a239c5187bc487c0"},
{file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:474da16199f6af66601a01546144922ce402cb17362e07d82f5a6cf8f963e449"},
{file = "scipy-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:255c0da161bd7b32a6c898e7891509e8a9289f0b1c6c7d96142ee0d2b114c2ea"},
{file = "scipy-1.17.0-cp311-cp311-win_arm64.whl", hash = "sha256:85b0ac3ad17fa3be50abd7e69d583d98792d7edc08367e01445a1e2076005379"},
{file = "scipy-1.17.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57"},
{file = "scipy-1.17.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e"},
{file = "scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8"},
{file = "scipy-1.17.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306"},
{file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742"},
{file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b"},
{file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d"},
{file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e"},
{file = "scipy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8"},
{file = "scipy-1.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b"},
{file = "scipy-1.17.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6"},
{file = "scipy-1.17.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269"},
{file = "scipy-1.17.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72"},
{file = "scipy-1.17.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61"},
{file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6"},
{file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752"},
{file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d"},
{file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea"},
{file = "scipy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812"},
{file = "scipy-1.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07"},
{file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00"},
{file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45"},
{file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209"},
{file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04"},
{file = "scipy-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0"},
{file = "scipy-1.17.0-cp313-cp313t-win_arm64.whl", hash = "sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67"},
{file = "scipy-1.17.0-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a"},
{file = "scipy-1.17.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2"},
{file = "scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467"},
{file = "scipy-1.17.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e"},
{file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67"},
{file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73"},
{file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b"},
{file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b"},
{file = "scipy-1.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061"},
{file = "scipy-1.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d"},
{file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba"},
{file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db"},
{file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf"},
{file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f"},
{file = "scipy-1.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088"},
{file = "scipy-1.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff"},
{file = "scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e"},
]
[package.dependencies]
numpy = ">=1.25.2,<2.6"
numpy = ">=1.26.4,<2.7"
[package.extras]
dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"]
dev = ["click (<8.3.0)", "cython-lint (>=0.12.2)", "mypy (==1.10.0)", "pycodestyle", "ruff (>=0.12.0)", "spin", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)", "tabulate"]
test = ["Cython", "array-api-strict (>=2.3.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest (>=8.0.0)", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
[[package]]

View File

@ -42,6 +42,40 @@ files = [
{file = "av-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:273a3e32de64819e4a1cd96341824299fe06f70c46f2288b5dc4173944f0fd62"},
{file = "av-16.1.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:640f57b93f927fba8689f6966c956737ee95388a91bd0b8c8b5e0481f73513d6"},
{file = "av-16.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ae3fb658eec00852ebd7412fdc141f17f3ddce8afee2d2e1cf366263ad2a3b35"},
{file = "av-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:27ee558d9c02a142eebcbe55578a6d817fedfde42ff5676275504e16d07a7f86"},
{file = "av-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7ae547f6d5fa31763f73900d43901e8c5fa6367bb9a9840978d57b5a7ae14ed2"},
{file = "av-16.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8cf065f9d438e1921dc31fc7aa045790b58aee71736897866420d80b5450f62a"},
{file = "av-16.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a345877a9d3cc0f08e2bc4ec163ee83176864b92587afb9d08dff50f37a9a829"},
{file = "av-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:f49243b1d27c91cd8c66fdba90a674e344eb8eb917264f36117bf2b6879118fd"},
{file = "av-16.1.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:ce2a1b3d8bf619f6c47a9f28cfa7518ff75ddd516c234a4ee351037b05e6a587"},
{file = "av-16.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:408dbe6a2573ca58a855eb8cd854112b33ea598651902c36709f5f84c991ed8e"},
{file = "av-16.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:57f657f86652a160a8a01887aaab82282f9e629abf94c780bbdbb01595d6f0f7"},
{file = "av-16.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:adbad2b355c2ee4552cac59762809d791bda90586d134a33c6f13727fb86cb3a"},
{file = "av-16.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f42e1a68ec2aebd21f7eb6895be69efa6aa27eec1670536876399725bbda4b99"},
{file = "av-16.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58fe47aeaef0f100c40ec8a5de9abbd37f118d3ca03829a1009cf288e9aef67c"},
{file = "av-16.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:565093ebc93b2f4b76782589564869dadfa83af5b852edebedd8fee746457d06"},
{file = "av-16.1.0-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:574081a24edb98343fd9f473e21ae155bf61443d4ec9d7708987fa597d6b04b2"},
{file = "av-16.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:9ab00ea29c25ebf2ea1d1e928d7babb3532d562481c5d96c0829212b70756ad0"},
{file = "av-16.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a84a91188c1071f238a9523fd42dbe567fb2e2607b22b779851b2ce0eac1b560"},
{file = "av-16.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:c2cd0de4dd022a7225ff224fde8e7971496d700be41c50adaaa26c07bb50bf97"},
{file = "av-16.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0816143530624a5a93bc5494f8c6eeaf77549b9366709c2ac8566c1e9bff6df5"},
{file = "av-16.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e3a28053af29644696d0c007e897d19b1197585834660a54773e12a40b16974c"},
{file = "av-16.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e3e67144a202b95ed299d165232533989390a9ea3119d37eccec697dc6dbb0c"},
{file = "av-16.1.0-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:39a634d8e5a87e78ea80772774bfd20c0721f0d633837ff185f36c9d14ffede4"},
{file = "av-16.1.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:0ba32fb9e9300948a7fa9f8a3fc686e6f7f77599a665c71eb2118fdfd2c743f9"},
{file = "av-16.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:ca04d17815182d34ce3edc53cbda78a4f36e956c0fd73e3bab249872a831c4d7"},
{file = "av-16.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ee0e8de2e124a9ef53c955fe2add6ee7c56cc8fd83318265549e44057db77142"},
{file = "av-16.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:22bf77a2f658827043a1e184b479c3bf25c4c43ab32353677df2d119f080e28f"},
{file = "av-16.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2dd419d262e6a71cab206d80bbf28e0a10d0f227b671cdf5e854c028faa2d043"},
{file = "av-16.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:53585986fd431cd436f290fba662cfb44d9494fbc2949a183de00acc5b33fa88"},
{file = "av-16.1.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:76f5ed8495cf41e1209a5775d3699dc63fdc1740b94a095e2485f13586593205"},
{file = "av-16.1.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:8d55397190f12a1a3ae7538be58c356cceb2bf50df1b33523817587748ce89e5"},
{file = "av-16.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:9d51d9037437218261b4bbf9df78a95e216f83d7774fbfe8d289230b5b2e28e2"},
{file = "av-16.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0ce07a89c15644407f49d942111ca046e323bbab0a9078ff43ee57c9b4a50dad"},
{file = "av-16.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:cac0c074892ea97113b53556ff41c99562db7b9f09f098adac1f08318c2acad5"},
{file = "av-16.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7dec3dcbc35a187ce450f65a2e0dda820d5a9e6553eea8344a1459af11c98649"},
{file = "av-16.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6f90dc082ff2068ddbe77618400b44d698d25d9c4edac57459e250c16b33d700"},
{file = "av-16.1.0.tar.gz", hash = "sha256:a094b4fd87a3721dacf02794d3d2c82b8d712c85b9534437e82a8a978c175ffd"},
]
[[package]]

View File

@ -1,4 +1,4 @@
{
"commit_hash": "ff7eb93f310d36f62b79ff5e229935bf50b934e7",
"timestamp": "2026-01-10T02:39:45Z"
"commit_hash": "c0e25e54181528c8e0818e2e9bc22fe5a889b8cc",
"timestamp": "2026-01-12T02:39:25Z"
}

View File

@ -5540,53 +5540,58 @@ testing = ["datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff",
[[package]]
name = "tomli"
version = "2.3.0"
version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"},
{file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"},
{file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"},
{file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"},
{file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"},
{file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"},
{file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"},
{file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"},
{file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"},
{file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"},
{file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"},
{file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"},
{file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"},
{file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"},
{file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"},
{file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"},
{file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"},
{file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"},
{file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"},
{file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"},
{file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"},
{file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"},
{file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"},
{file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"},
{file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"},
{file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"},
{file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"},
{file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"},
{file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"},
{file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"},
{file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"},
{file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
{file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"},
{file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"},
{file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"},
{file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"},
{file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"},
{file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"},
{file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"},
{file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"},
{file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"},
{file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"},
{file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"},
{file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"},
{file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"},
{file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"},
{file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"},
{file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"},
{file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"},
{file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"},
{file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"},
{file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"},
{file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"},
{file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"},
{file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"},
{file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"},
{file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"},
{file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"},
{file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"},
{file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"},
{file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"},
{file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"},
{file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"},
{file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"},
{file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"},
{file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"},
{file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"},
]
[[package]]

View File

@ -316,32 +316,50 @@ class ModelConfig(Generic[TConfig]):
quant_config = QuantConfig()
layer_quant_config = None
# Read exclude_modules from HF config if present (HF format module names)
hf_exclude_modules = hf_quant_config.get('modules_to_not_convert', None)
# DeepSeek V3 FP8 ckpt
if hf_quant_config.get("quant_method") == "fp8" and hf_quant_config.get(
"weight_block_size", []):
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if moe_backend == 'TRTLLM':
# TODO: This is a hack. Remove after fp8 bmm is integrated.
quant_config.exclude_modules = [
"*kv_b_proj*", "*k_b_proj*", "*eh_proj"
]
else:
quant_config.exclude_modules = ["*eh_proj"]
block_size = hf_quant_config.get("weight_block_size", [])
assert tuple(block_size) == (
128, 128), "FP8_BLOCK_SCALES only supports block_size=(128,128)"
quant_config.group_size = block_size[0]
# Set default exclude_modules for FP8_BLOCK_SCALES
if moe_backend == 'TRTLLM':
default_exclude = ["*kv_b_proj*", "*k_b_proj*", "*eh_proj"]
else:
default_exclude = ["*eh_proj"]
# Merge HF config's modules_to_not_convert with default exclude_modules
if hf_exclude_modules is not None:
quant_config.exclude_modules = list(
set(hf_exclude_modules + default_exclude))
else:
quant_config.exclude_modules = default_exclude
# MXFP4 checkpoints.
elif hf_quant_config.get("quant_method") == "mxfp4":
quant_config.quant_algo = ModelConfig.get_mxfp4_quant_algo(
moe_backend)
quant_config.group_size = 32
quant_config.exclude_modules = [
# Default exclude_modules for MXFP4 (TRTLLM internal format)
default_exclude = [
'block.*.attn.out', 'block.*.mlp.gate', 'block.*.attn.qkv',
'embedding', 'unembedding'
]
# Merge HF config's modules_to_not_convert with default exclude_modules
if hf_exclude_modules is not None:
quant_config.exclude_modules = list(
set(hf_exclude_modules + default_exclude))
else:
quant_config.exclude_modules = default_exclude
return quant_config, layer_quant_config
@staticmethod
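As a quick illustration of the merge above (a minimal sketch; the HF module names are hypothetical), the HF list and the backend defaults are concatenated and deduplicated through a set, so shared entries such as "*eh_proj" appear only once and the resulting order is unspecified:

hf_exclude_modules = ["model.embed_tokens", "*eh_proj"]      # hypothetical modules_to_not_convert from the HF config
default_exclude = ["*kv_b_proj*", "*k_b_proj*", "*eh_proj"]  # TRTLLM moe_backend defaults from the code above
exclude_modules = list(set(hf_exclude_modules + default_exclude))
# e.g. ['*kv_b_proj*', 'model.embed_tokens', '*eh_proj', '*k_b_proj*'] -- order is not preserved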

View File

@ -5,6 +5,7 @@ from .modeling_bert import BertForSequenceClassification
from .modeling_clip import CLIPVisionModel
from .modeling_deepseekv3 import DeepseekV3ForCausalLM
from .modeling_exaone4 import Exaone4ForCausalLM
from .modeling_exaone_moe import ExaoneMoeForCausalLM
from .modeling_gemma3 import Gemma3ForCausalLM
from .modeling_gemma3vl import Gemma3VLM
from .modeling_glm import Glm4MoeForCausalLM
@ -44,6 +45,7 @@ __all__ = [
"CLIPVisionModel",
"DeepseekV3ForCausalLM",
"Exaone4ForCausalLM",
"ExaoneMoeForCausalLM",
"Gemma3ForCausalLM",
"Gemma3VLM",
"HCXVisionForCausalLM",

View File

@ -0,0 +1,581 @@
import math
import os
import re
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn
from tensorrt_llm._ipc_utils import can_access_peer
from tensorrt_llm._torch.modules.qk_norm_attention import QKNormRoPEAttention
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo
from ...logger import logger
from ..attention_backend import AttentionMetadata
from ..attention_backend.interface import (
PositionalEmbeddingParams,
PredefinedAttentionMask,
RopeParams,
)
from ..distributed import (
AllReduce,
AllReduceFusionOp,
AllReduceParams,
MoEAllReduce,
MoEAllReduceParams,
)
from ..model_config import ModelConfig
from ..models.modeling_deepseekv3 import Deepseekv3MoE
from ..modules.decoder_layer import DecoderLayer
from ..modules.embedding import Embedding
from ..modules.gated_mlp import GatedMLP
from ..modules.linear import TensorParallelMode
from ..modules.rms_norm import RMSNorm
from ..utils import AuxStreamType, Fp4QuantizedTensor
from .modeling_utils import (
DecoderModel,
DecoderModelForCausalLM,
EagerFusionConfig,
register_auto_model,
)
# fmt: off
# TODO: Remove this once we have a proper transformers package
from transformers import AutoConfig, PretrainedConfig # isort: skip
class ExaoneMoEConfig(PretrainedConfig):
model_type = "exaone_moe"
logger.warning_once(
"transformers does not support 'ExaoneMoEConfig'. "
"Register ExaoneMoEConfig to mimic the ExaoneMoE model.",
key="EXAONE_MOE_REGISTER_WARNING"
)
AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig)
# End of the config register.
# fmt: on
def check_is_moe(config: ExaoneMoEConfig, layer_idx: int) -> bool:
"""
Check if the current layer is a MoE layer.
"""
return hasattr(config, "is_moe_layer") and config.is_moe_layer[layer_idx]
def enable_attn_allreduce(mapping: Mapping):
return not mapping.enable_attention_dp or mapping.has_tp()
class ExaoneMoeAttention(QKNormRoPEAttention):
def __init__(
self,
model_config: ModelConfig[ExaoneMoEConfig],
layer_idx: Optional[int] = None,
fuse_qk_norm_rope: bool = False,
disable_deep_gemm: bool = False,
):
config = model_config.pretrained_config
self.attention_window_size = None
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
# NOTE: In ExaoneMoe, only sliding layers apply rope.
pos_embd_params = None
if self.is_sliding:
self.attention_window_size = config.sliding_window
pos_embd_params = PositionalEmbeddingParams(
type=PositionEmbeddingType.rope_gpt_neox,
rope=RopeParams.from_config(config),
)
fuse_qk_norm_rope = self.is_sliding and fuse_qk_norm_rope
# NOTE: Fusing qk norm with rope currently has an issue that slightly hurts accuracy.
assert not fuse_qk_norm_rope, "Fusing qk norm and rope currently has an accuracy issue"
super().__init__(
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
max_position_embeddings=config.max_position_embeddings,
bias=False,
pos_embd_params=pos_embd_params,
fuse_qk_norm_rope=fuse_qk_norm_rope,
skip_rope=not self.is_sliding,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
disable_deep_gemm=disable_deep_gemm,
reduce_output=enable_attn_allreduce(model_config.mapping),
)
def forward(
self,
position_ids: Optional[torch.LongTensor],
hidden_states: torch.Tensor,
attn_metadata: AttentionMetadata,
attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.CAUSAL,
lora_params: Optional[dict] = None,
**kwargs,
) -> torch.Tensor:
return super().forward(
position_ids=position_ids,
hidden_states=hidden_states,
attn_metadata=attn_metadata,
attention_mask=attention_mask,
lora_params=lora_params,
attention_window_size=self.attention_window_size,
**kwargs,
)
class ExaoneMoeSparseMoEBlock(Deepseekv3MoE):
"""
ExaoneMoe Sparse MoE Block Layer.
It follows DeepSeek-V3 implementation.
"""
class ExaoneMoeDecoderLayer(DecoderLayer):
def __init__(
self,
model_config: ModelConfig[ExaoneMoEConfig],
aux_stream_dict: Dict[AuxStreamType, torch.cuda.Stream],
layer_idx: int,
):
super().__init__()
self.model_config = model_config
config = model_config.pretrained_config
self.layer_idx = layer_idx
self.mapping = model_config.mapping
mapping = self.mapping
self.enable_attention_dp = mapping.enable_attention_dp
self.mlp_tp_size = mapping.tp_size
self.is_p2p_supported = can_access_peer(mapping)
self.fusion_config = EagerFusionConfig()
# MoE fusions are disabled by default in K-EXAONE since
# they may cause a slight accuracy drop due to numerical differences.
self.enable_fusion = os.environ.get("TRTLLM_EXAONE_EAGER_FUSION_ENABLED", "0") == "1"
self.enable_fusion &= not self.enable_attention_dp
# FIXME: incompatible with mixed quantization mode
quant_config = self._get_decoder_layer_quant_config(model_config, layer_idx)
self.is_nvfp4 = quant_config.layer_quant_mode.has_nvfp4()
assert quant_config.quant_algo is not QuantAlgo.MIXED_PRECISION, (
"MIXED_PRECISION is ambiguous"
)
self.allreduce = None
self.moe_allreduce = None
if not self.enable_attention_dp and self.mapping.tp_size > 1:
self.allreduce = AllReduce(
mapping=model_config.mapping,
strategy=model_config.allreduce_strategy,
dtype=config.torch_dtype,
)
self.moe_allreduce = MoEAllReduce(self.mapping)
has_tp = mapping.has_tp()
has_pp = mapping.has_pp()
# Submodule definitions
self.input_layernorm = RMSNorm(
hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
)
self.self_attn = ExaoneMoeAttention(model_config, layer_idx=layer_idx)
# MoE or Dense layer
self.is_moe_layer = check_is_moe(config, layer_idx)
if self.is_moe_layer:
self.fusion_config.PRE_MOE_FUSION = self.enable_fusion and has_tp
self.fusion_config.POST_MOE_FUSION = self.fusion_config.PRE_MOE_FUSION and not has_pp
self.mlp = ExaoneMoeSparseMoEBlock(
num_experts=config.num_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
intermediate_size=config.moe_intermediate_size,
shared_expert_intermediate_size=config.moe_intermediate_size
* config.num_shared_experts,
dtype=config.torch_dtype,
model_config=model_config,
override_quant_config=quant_config,
aux_stream_dict=aux_stream_dict,
layer_idx=layer_idx,
)
else:
block_size = 1
if quant_config.quant_algo is None and quant_config.group_size is not None:
block_size = quant_config.group_size
self.mlp_tp_size = self._compute_mlp_tp_size(config.intermediate_size, block_size)
has_mlp_tp = self.mlp_tp_size > 1
self.fusion_config.PRE_MLP_FUSION = self.enable_fusion and has_mlp_tp and self.is_nvfp4
self.fusion_config.POST_MLP_FUSION = self.enable_fusion and has_mlp_tp
self.mlp = GatedMLP(
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
bias=False,
dtype=config.torch_dtype,
config=model_config,
# Keep sharding consistent with computed mlp_tp_size.
# In attention-DP, mlp_tp_size==1 -> disable TP sharding here.
overridden_tp_size=self.mlp_tp_size,
layer_idx=layer_idx,
reduce_output=has_mlp_tp,
)
self.disable_attn_allreduce = (
self.fusion_config.PRE_MOE_FUSION
or self.fusion_config.PRE_MLP_FUSION
or self.mapping.tp_size == 1
or self.enable_attention_dp
)
self.post_attention_layernorm = RMSNorm(
hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
)
self.next_layer_layernorm: RMSNorm = None
def _get_decoder_layer_quant_config(
self, model_config: ModelConfig[ExaoneMoEConfig], layer_idx: int
):
"""
The MTP layer in the nvfp4 checkpoint is unquantized. Because the TRTLLM
moe_backend only supports fp8/fp4 quantization, we need to override
the quant_config for the MTP layer.
"""
quant_config = model_config.quant_config
layer_name = f"model.layers.{layer_idx}"
if quant_config.is_module_excluded_from_quantization(layer_name):
return QuantConfig(
quant_algo=None,
kv_cache_quant_algo=quant_config.kv_cache_quant_algo,
)
else:
return model_config.quant_config
def _compute_mlp_tp_size(self, intermediate_size: int, block_size: int) -> int:
"""Adopted from DeepseekV3DecoderLayer._compute_mlp_tp_size."""
assert intermediate_size % block_size == 0, (
f"intermediate_size {intermediate_size} must be divisible by block_size {block_size}."
)
if self.enable_attention_dp:
# If using attention DP, the MLP also uses DP instead of TP.
mlp_tp_size = 1
else:
# The two math.gcd operations ensure that mlp_tp_size falls in the candidate TP sizes.
tp = math.gcd(
intermediate_size // block_size,
self.mapping.tp_size,
)
if tp > self.mapping.gpus_per_node:
mlp_tp_size = math.gcd(
tp,
self.mapping.gpus_per_node,
) # Avoid costly inter-node TP
else:
mlp_tp_size = tp
return mlp_tp_size
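# Worked example of the gcd-based sizing above (hypothetical numbers, not from a real config):
# intermediate_size = 18432, block_size = 128 -> 18432 // 128 = 144 shardable groups.
# With mapping.tp_size = 16 and gpus_per_node = 8: gcd(144, 16) = 16 > 8, so the second
# gcd caps it at gcd(16, 8) = 8, keeping the MLP TP group within a single node.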
def forward(
self,
position_ids: torch.LongTensor,
hidden_states: torch.Tensor,
attn_metadata: AttentionMetadata,
residual: Optional[torch.Tensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
# LN has already been applied at the previous layer, except for the first layer.
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states = self.self_attn(
position_ids=position_ids,
hidden_states=hidden_states,
attn_metadata=attn_metadata,
all_reduce_params=AllReduceParams(enable_allreduce=not (self.disable_attn_allreduce)),
**kwargs,
)
if self.is_moe_layer:
hidden_states, residual = self.forward_moe(
hidden_states=hidden_states,
attn_metadata=attn_metadata,
residual=residual,
)
else:
hidden_states, residual = self.forward_mlp(
hidden_states=hidden_states,
residual=residual,
)
return hidden_states, residual
def forward_moe(
self,
hidden_states: torch.Tensor,
attn_metadata: AttentionMetadata,
residual: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
def _run_moe(hidden_states, hidden_states_fp4, do_finalize):
return self.mlp(
hidden_states,
hidden_states_fp4,
all_rank_num_tokens=attn_metadata.all_rank_num_tokens,
final_all_reduce_params=AllReduceParams(
enable_allreduce=not (
self.fusion_config.POST_MOE_FUSION or self.mapping.tp_size == 1
)
),
do_finalize=do_finalize,
)
if self.fusion_config.PRE_MOE_FUSION:
# moe_backend can be either CUTLASS or TRTLLM here
hidden_states, residual = self.allreduce(
hidden_states,
all_reduce_params=AllReduceParams(
fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
residual=residual,
norm_weight=self.post_attention_layernorm.weight,
eps=self.post_attention_layernorm.variance_epsilon,
trigger_completion_at_end=False,
),
)
else:
# No fusion
hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
# Note: this fusion pattern is only supported for single-node TRTLLM-nvfp4 backend now
do_finalize = self.mapping.is_multi_node() or (
not (
self.fusion_config.POST_MOE_FUSION
and hidden_states.shape[0] <= self.moe_allreduce.max_token
and self.model_config.moe_backend == "TRTLLM"
and self.mlp.experts.has_nvfp4
and self.is_p2p_supported
)
)
hidden_states = _run_moe(hidden_states, hidden_states_fp4=None, do_finalize=do_finalize)
if self.fusion_config.POST_MOE_FUSION:
if do_finalize:
hidden_states, residual = self.allreduce(
hidden_states,
all_reduce_params=AllReduceParams(
fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
residual=residual,
norm_weight=self.next_layer_layernorm.weight,
eps=self.next_layer_layernorm.variance_epsilon,
trigger_completion_at_end=False,
),
)
else:
assert len(hidden_states) == 4, "hidden_states must have 4 elements"
shared_output = hidden_states[0]
fc2_output = hidden_states[1]
expert_scale_factor = hidden_states[2]
expanded_idx_to_permuted_idx = hidden_states[3]
moe_all_reduce_params = MoEAllReduceParams(
expanded_idx_to_permuted_idx=expanded_idx_to_permuted_idx,
expert_scale_factor=expert_scale_factor,
shared_expert_output=shared_output,
residual=residual,
norm_weight=self.next_layer_layernorm.weight,
eps=self.next_layer_layernorm.variance_epsilon,
is_cutlass_min_latency=False,
)
hidden_states, residual = self.moe_allreduce(
fc2_output, all_reduce_params=moe_all_reduce_params
)
elif self.next_layer_layernorm is not None:
hidden_states, residual = self.next_layer_layernorm(hidden_states, residual)
return hidden_states, residual
def forward_mlp(
self,
hidden_states: torch.Tensor,
residual: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
if self.fusion_config.PRE_MLP_FUSION:
act_fp4, act_sf, residual = self.allreduce(
hidden_states,
all_reduce_params=AllReduceParams(
fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4,
residual=residual,
norm_weight=self.post_attention_layernorm.weight,
scale=self.mlp.gate_up_proj.input_scale,
eps=self.post_attention_layernorm.variance_epsilon,
),
)
hidden_states = Fp4QuantizedTensor(act_fp4, act_sf)
else:
hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
hidden_states = self.mlp(
hidden_states,
final_all_reduce_params=AllReduceParams(
enable_allreduce=not (self.fusion_config.POST_MLP_FUSION or self.mlp_tp_size == 1)
),
)
if self.fusion_config.POST_MLP_FUSION:
hidden_states, residual = self.allreduce(
hidden_states,
all_reduce_params=AllReduceParams(
fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
residual=residual,
norm_weight=self.next_layer_layernorm.weight,
eps=self.next_layer_layernorm.variance_epsilon,
),
)
elif self.next_layer_layernorm is not None:
hidden_states, residual = self.next_layer_layernorm(hidden_states, residual)
return hidden_states, residual
class ExaoneMoeModel(DecoderModel):
def __init__(self, model_config: ModelConfig[ExaoneMoEConfig]):
super().__init__(model_config)
config = self.model_config.pretrained_config
self.num_hidden_layers = config.num_hidden_layers
self.embed_tokens = Embedding(
config.vocab_size,
config.hidden_size,
dtype=config.torch_dtype,
mapping=model_config.mapping,
tensor_parallel_mode=TensorParallelMode.COLUMN,
gather_output=True,
)
aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
self.aux_stream_dict = {
AuxStreamType.Attention: aux_stream_list[0],
AuxStreamType.MoeShared: aux_stream_list[0],
AuxStreamType.MoeChunkingOverlap: aux_stream_list[1],
AuxStreamType.MoeBalancer: aux_stream_list[2],
}
self.layers = nn.ModuleList(
[
ExaoneMoeDecoderLayer(
model_config=model_config,
aux_stream_dict=self.aux_stream_dict,
layer_idx=layer_idx,
)
for layer_idx in range(self.num_hidden_layers)
]
)
self.norm = RMSNorm(
hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype
)
def forward(
self,
attn_metadata: AttentionMetadata,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
lora_params=None,
**kwargs,
) -> torch.Tensor | Tuple[torch.Tensor, Optional[torch.Tensor]]:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at "
"the same time, and must specify either one."
)
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds.to(self.dtype)
residual = None
for decoder_layer in self.layers[: self.num_hidden_layers]:
hidden_states, residual = decoder_layer(
position_ids=position_ids,
hidden_states=hidden_states,
attn_metadata=attn_metadata,
residual=residual,
lora_params=lora_params,
)
# The last LN has already been applied as part of the fusion.
return hidden_states
@register_auto_model("ExaoneMoEForCausalLM")
class ExaoneMoeForCausalLM(DecoderModelForCausalLM[ExaoneMoeModel, ExaoneMoEConfig]):
def __init__(
self,
model_config: ModelConfig[ExaoneMoEConfig],
):
super().__init__(
ExaoneMoeModel(model_config),
config=model_config,
hidden_size=model_config.pretrained_config.hidden_size,
vocab_size=model_config.pretrained_config.vocab_size,
)
def load_weights(
self,
weights: Dict,
weight_mapper: Optional["BaseWeightMapper"] = None, # noqa: F821
skip_modules: Optional[List[str]] = None,
allow_partial_loading: bool = False,
):
# MoE naming pattern.
moe_weight_patterns = {
"gate_proj": "w1",
"up_proj": "w3",
"down_proj": "w2",
}
module_names = list(weights)
for name in module_names:
if "mlp.e_score_correction_bias" in name:
# Move bias into the gate module.
new_name = name.replace(
"mlp.e_score_correction_bias", "mlp.gate.e_score_correction_bias"
)
else:
# MoE Weight Remapping.
new_name = name
for k, v in moe_weight_patterns.items():
pattern = rf"(experts\.\d+\.){k}\b"
new_name = re.sub(pattern, rf"\1{v}", new_name)
# Remap the name-parameter pair if needed.
if new_name != name:
weights[new_name] = weights.pop(name)
super().load_weights(
weights=weights,
weight_mapper=weight_mapper,
skip_modules=skip_modules or [],
allow_partial_loading=allow_partial_loading,
)
def post_load_weights(self):
# For the cross-layer residual+LN fusion.
for idx, layer in enumerate(self.model.layers[: self.config.num_hidden_layers]):
if idx == self.config.num_hidden_layers - 1:
layer.next_layer_layernorm = self.model.norm
else:
layer.next_layer_layernorm = self.model.layers[idx + 1].input_layernorm
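A minimal sketch of the checkpoint-key remapping performed in load_weights above, applied to a hypothetical weight name:

import re

moe_weight_patterns = {"gate_proj": "w1", "up_proj": "w3", "down_proj": "w2"}
name = "model.layers.3.mlp.experts.0.gate_proj.weight"  # hypothetical checkpoint key
new_name = name
for k, v in moe_weight_patterns.items():
    new_name = re.sub(rf"(experts\.\d+\.){k}\b", rf"\1{v}", new_name)
print(new_name)  # model.layers.3.mlp.experts.0.w1.weight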

View File

@ -32,6 +32,7 @@ from typing import Dict, List, Optional, Tuple, Union
import torch
from tensorrt_llm._torch.expert_statistic import ExpertStatistic
from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.modules.fused_moe.interface import MoE
from tensorrt_llm._torch.modules.fused_moe.routing import BaseMoeRoutingMethod
@ -619,6 +620,10 @@ class ConfigurableMoE(MoE):
else:
token_selected_slots = token_selected_experts
if token_selected_slots is not None:
ExpertStatistic.set_layer(self.layer_idx)
ExpertStatistic.maybe_add_info(self.num_slots, token_selected_slots)
# ========== Step 3.5: Communication Prepare Phase (BEFORE quantization) ==========
# NVLINK two-sided has a prepare phase to gather EPLB statistics
@ -647,6 +652,10 @@ class ConfigurableMoE(MoE):
# supports_post_quant_dispatch checks strategy capability for the current quant mode
supports_post_quant = self.comm.supports_post_quant_dispatch()
# Call dummy_allreduce before allgather for load balancing debug
if self.enable_dummy_allreduce:
self.dummy_allreduce()
if supports_post_quant:
# ===== Post-quant flow: Quantize → Dispatch =====
@ -710,6 +719,8 @@ class ConfigurableMoE(MoE):
# ========== Step 9: Communication - Combine ==========
if self.comm is not None:
if self.enable_dummy_allreduce:
self.dummy_allreduce()
# Use unified combine interface (reads dispatch state from strategy)
final_hidden_states = self.comm.combine(final_hidden_states)
else:

View File

@ -159,10 +159,6 @@ class WideEPMoE(MoE):
if not model_config.skip_create_weights_in_init:
self.create_weights()
# Debug function for eliminating imbalance during performance analysis.
self.enable_dummy_allreduce = os.environ.get(
"TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") == "1"
# MoE op will be lazily initialized when first accessed (see moe_op_impl property)
self._moe_op_impl = None
@ -342,16 +338,6 @@ class WideEPMoE(MoE):
self._moe_op_impl = MoEOpSelector.select_op(self)
return self._moe_op_impl
def dummy_allreduce(self):
"""
Debug function for eliminating imbalance during performance analysis.
Creates a small dummy tensor and performs allreduce to synchronize processes
and eliminate timing imbalances for more accurate profiling measurements.
"""
dummy_tensor = torch.zeros(4, dtype=torch.float32, device='cuda')
dummy_tensor = self.all_reduce(dummy_tensor)
return dummy_tensor
def reducescatter_or_allreduce(
self,
inputs,

View File

@ -1,3 +1,4 @@
import os
import weakref
from abc import abstractmethod
from enum import Enum, IntEnum
@ -200,11 +201,19 @@ class MoE(nn.Module):
self.intermediate_size_per_partition = intermediate_size // self.tp_size
self.all_reduce = None
# Debug function for eliminating imbalance during performance analysis.
self.enable_dummy_allreduce = os.environ.get(
"TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") == "1"
if not self.use_dp and self.mapping.tp_size > 1:
self.all_reduce = AllReduce(
mapping=self.mapping,
strategy=model_config.allreduce_strategy,
dtype=self.dtype)
elif self.enable_dummy_allreduce:
from tensorrt_llm.functional import AllReduceStrategy
self.all_reduce = AllReduce(mapping=self.mapping,
strategy=AllReduceStrategy.NCCL,
dtype=self.dtype)
# Initialize load balancer related attributes
if init_load_balancer:
@ -748,3 +757,14 @@ class MoE(nn.Module):
elif self.reduce_results:
outputs = self.all_reduce(inputs)
return outputs
def dummy_allreduce(self):
"""
Debug function for eliminating imbalance during performance analysis.
Creates a small dummy tensor and performs allreduce to synchronize processes
and eliminate timing imbalances for more accurate profiling measurements.
"""
assert self.enable_dummy_allreduce and self.all_reduce is not None, "Dummy allreduce is not enabled"
dummy_tensor = torch.zeros(4, dtype=torch.float32, device="cuda")
dummy_tensor = self.all_reduce(dummy_tensor)
return dummy_tensor
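For reference, a hedged sketch of how this debug path is toggled; only the environment variable name comes from the diff, the rest is illustrative:

import os

# Must be set before the MoE module is constructed so enable_dummy_allreduce is picked up.
os.environ["TRTLLM_ENABLE_DUMMY_ALLREDUCE"] = "1"
# With the flag set, the constructor above creates an NCCL AllReduce even when the layer
# would not otherwise need one, so dummy_allreduce() can synchronize ranks during profiling.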

View File

@ -257,21 +257,33 @@ class Deepseekv3RoutingImpl:
if self.n_group > 1:
if self.top_k > 8 or (num_experts / n_group) > 32 or (
num_experts / n_group) * self.topk_group > 128:
if (self.is_fused):
if self.is_fused:
warnings.warn(
"The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
)
self.is_fused = False
else:
elif (num_experts > 512 or (self.top_k > 8 and self.top_k != 22)
or self.topk_group == 1):
# We have a special implementation for n_group == 1, top_k == 22 and num_experts == 512 for Nemotron Super v3.
if num_experts > 512 or (self.top_k > 8 and self.top_k != 22):
if (self.is_fused):
warnings.warn(
"The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
)
self.is_fused = False
if self.is_fused:
warnings.warn(
"The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
)
self.is_fused = False
if not self.is_fused:
if self.n_group == 1 and self.topk_group == 1:
scores, scores_with_bias = self.get_scores(logits,
e_score_correction_bias)
_, topk_indices = torch.topk(scores_with_bias, k=self.top_k, dim=1)
topk_values = torch.gather(scores, dim=1,
index=topk_indices).type_as(scores)
# Normalize and scale.
topk_values_sum = torch.sum(topk_values, dim=-1,
keepdim=True) + 1e-20
topk_values = topk_values / topk_values_sum * self.routed_scaling_factor
return topk_values, topk_indices
elif not self.is_fused:
scores, scores_with_bias = self.get_scores(logits,
e_score_correction_bias)
scores_shape = list(scores_with_bias.shape)

View File

@ -1167,7 +1167,8 @@ class PyExecutor:
for req in previous_batch.scheduled_ctx_reqs:
if req.is_context_only_request and (
req.is_context_finished
or req.is_finished_due_to_length):
or req.is_finished_due_to_length
) and not req.is_finished_due_to_cancellation:
block_id = self.kv_cache_manager.store_blocks_for_reuse(
req, True)
self.ctx_in_transmission_requests[
@ -1436,7 +1437,8 @@ class PyExecutor:
for req in scheduled_batch.context_requests:
if req.is_context_only_request and (
req.is_context_finished
or req.is_finished_due_to_length):
or req.is_finished_due_to_length
) and not req.is_finished_due_to_cancellation:
block_id = self.kv_cache_manager.store_blocks_for_reuse(
req, True)
self.ctx_in_transmission_requests[
@ -1686,7 +1688,8 @@ class PyExecutor:
for req in self.previous_batch.sample_state.scheduled_requests.context_requests:
if req.is_context_only_request and (
req.is_context_finished
or req.is_finished_due_to_length):
or req.is_finished_due_to_length
) and not req.is_finished_due_to_cancellation:
block_id = self.kv_cache_manager.store_blocks_for_reuse(
req, True)
self.ctx_in_transmission_requests[
@ -2196,8 +2199,9 @@ class PyExecutor:
if (scheduled_ctx_requests is None or len(scheduled_ctx_requests) == 0):
return []
for req in scheduled_ctx_requests:
if req.is_context_only_request and (req.is_context_finished or
req.is_finished_due_to_length):
if req.is_context_only_request and (
req.is_context_finished or req.is_finished_due_to_length
) and not req.is_finished_due_to_cancellation:
self.kv_cache_transceiver.respond_and_send_async(req)
for resource_mgr_type in (
ResourceManagerType.SEQ_SLOT_MANAGER,

View File

@ -1431,7 +1431,8 @@ class ResourceManager:
resource_manager.update_resources(scheduled_batch)
def free_resources(self, request: LlmRequest):
for _, resource_manager in reversed(self.resource_managers.items()):
for resource_type, resource_manager in reversed(
self.resource_managers.items()):
if hasattr(resource_manager, "free_resources"):
resource_manager.free_resources(request)

View File

@ -560,7 +560,7 @@ class ReportUtility:
else:
backend_info = (
"\n\n===========================================================\n"
"= PYTORCH BACKEND\n"
f"= {self.rt_cfg.backend.upper()} BACKEND\n"
"===========================================================\n"
f"Model:\t\t\t{engine['model']}\n"
f"Model Path:\t\t{engine['model_path']}\n"

View File

@ -207,7 +207,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
kwargs = self.get_default_kwargs()
# TODO: multi-stream MOE seems to increase the memory usage
kwargs["max_batch_size"] = 32
kwargs["free_mem_ratio"] = 0.5
kwargs["free_mem_ratio"] = 0.4
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
@ -226,9 +226,9 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
# Manually set quant_config for FP8 model to get the accuracy threshold
llm.args.quant_config.quant_algo = QuantAlgo.FP8
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
# task = MMLU(self.MODEL_NAME)
# task.evaluate(llm, sampling_params=sampling_params)
sampling_params = self.get_default_sampling_params()
task = MMLU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=sampling_params)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

View File

@ -260,6 +260,7 @@ class TestQwen3VL_MOE(LlmapiAccuracyTestHarness):
max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<|endoftext|>"
)
@pytest.mark.skip_less_device_memory(140000)
def test_auto_dtype(self):
with LLM(
self.MODEL_PATH,

View File

@ -0,0 +1,44 @@
hostname: localhost
port: 8000
model: DeepSeek-V3-Lite/bf16
backend: "pytorch"
enable_autotuner: False
context_servers:
disable_overlap_scheduler: True
num_instances: 1
tensor_parallel_size: 1
pipeline_parallel_size: 1
max_num_tokens: 16384
max_seq_len: 32768
enable_chunked_prefill: True
kv_cache_config:
enable_block_reuse: True
enable_partial_reuse: True
free_gpu_memory_fraction: 0.3
cache_transceiver_config:
backend: "DEFAULT"
max_tokens_in_buffer: 32768
cuda_graph_config:
enable_padding: True
max_batch_size: 1
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
tensor_parallel_size: 1
pipeline_parallel_size: 1
max_num_tokens: 2048
max_seq_len: 32768
enable_chunked_prefill: True
kv_cache_config:
enable_block_reuse: True
enable_partial_reuse: True
free_gpu_memory_fraction: 0.85
cache_transceiver_config:
backend: "DEFAULT"
max_tokens_in_buffer: 32768
cuda_graph_config:
enable_padding: True
max_batch_size: 64
urls:
- "localhost:8002"

View File

@ -0,0 +1,44 @@
hostname: localhost
port: 8000
model: DeepSeek-V3-0324-FP4
backend: "pytorch"
enable_autotuner: False
context_servers:
disable_overlap_scheduler: True
num_instances: 1
tensor_parallel_size: 4
pipeline_parallel_size: 1
max_num_tokens: 12000
max_seq_len: 262144
enable_chunked_prefill: True
kv_cache_config:
enable_block_reuse: True
enable_partial_reuse: True
free_gpu_memory_fraction: 0.2
cache_transceiver_config:
backend: "DEFAULT"
max_tokens_in_buffer: 262144
cuda_graph_config:
enable_padding: True
max_batch_size: 1
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
tensor_parallel_size: 4
pipeline_parallel_size: 1
max_num_tokens: 2048
max_seq_len: 262144
enable_chunked_prefill: True
kv_cache_config:
enable_block_reuse: True
enable_partial_reuse: True
free_gpu_memory_fraction: 0.3
cache_transceiver_config:
backend: "DEFAULT"
max_tokens_in_buffer: 262144
cuda_graph_config:
enable_padding: True
max_batch_size: 11
urls:
- "localhost:8002"

View File

@ -200,6 +200,10 @@ def get_test_config(test_desc, example_dir, test_root):
"gpt_oss_120b_stress":
(4,
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml"),
"cancel_stress_test":
(2, f"{test_configs_root}/disagg_config_cancel_stress_test.yaml"),
"cancel_stress_test_large":
(8, f"{test_configs_root}/disagg_config_cancel_stress_test_large.yaml"),
}
if test_desc not in config_map:
@ -2098,3 +2102,211 @@ def test_disaggregated_stress_test(disaggregated_test_root,
threshold=test_config.accuracy_threshold,
env=llm_venv._new_env,
cwd=llm_venv.get_working_directory())
def run_cancel_stress_test(server_url: str,
num_bursts: int = 5,
requests_per_burst: int = 32,
prompt_len_range: tuple = (2000, 8000),
cancel_after_range: tuple = (0.01, 0.1)):
"""
Stress test that sends requests with large contexts and cancels them
during prefill to test resource cleanup under cancellation.
Args:
server_url: The server URL (e.g., "http://localhost:8000")
num_bursts: Number of request bursts to send
requests_per_burst: Number of concurrent requests per burst
prompt_len_range: (min, max) prompt length in tokens
cancel_after_range: (min, max) seconds to wait before cancelling
"""
import asyncio
import random
import time
import aiohttp
async def spam_and_cancel(session, req_id, url, prompt_len_range,
cancel_after_range):
"""Send a request and cancel it during prefill."""
prompt_len = random.randint(prompt_len_range[0], prompt_len_range[1])
prompt = "test " * (prompt_len // 5)
payload = {
"model": "test-model",
"prompt": prompt,
"max_tokens": 10,
"stream": True
}
try:
cancel_after = random.uniform(cancel_after_range[0],
cancel_after_range[1])
start = time.time()
async with session.post(
f"{url}/v1/completions",
json=payload,
timeout=aiohttp.ClientTimeout(total=60)) as resp:
async for line in resp.content:
if time.time() - start > cancel_after:
# Force disconnect during prefill
break
except Exception:
pass # Connection abort is expected
async def run_bursts():
async with aiohttp.ClientSession() as session:
for burst_idx in range(num_bursts):
tasks = [
spam_and_cancel(session, i, server_url, prompt_len_range,
cancel_after_range)
for i in range(requests_per_burst)
]
await asyncio.gather(*tasks)
logger.info(
f"Completed burst {burst_idx + 1}/{num_bursts} ({requests_per_burst} requests)"
)
await asyncio.sleep(0.05)
asyncio.run(run_bursts())
def run_disaggregated_cancel_test(example_dir,
test_desc,
env=None,
cwd=None,
num_bursts=64,
requests_per_burst=64):
"""Run disaggregated test with request cancellation stress test."""
cleanup_output_files()
run_env = env.copy()
run_env["UCX_TLS"] = "^ib"
num_ranks, config_file = get_test_config(test_desc, example_dir,
os.path.dirname(__file__))
workers_cmd = [
'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
str(num_ranks), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
config_file
]
server_start_timeout = 1200
server_cmd = [
'trtllm-serve', 'disaggregated', '--server_start_timeout',
str(server_start_timeout), '-c', config_file
]
server_host, server_port = get_disagg_server_url_from_cfg(config_file)
server_url = f"http://{server_host}:{server_port}"
try:
with (open('output_workers.log', 'w') as output_workers,
popen(workers_cmd,
stdout=output_workers,
stderr=subprocess.STDOUT,
env=run_env,
cwd=cwd) as workers_proc, open('output_disagg.log', 'w') as
output_disagg,
popen(server_cmd,
stdout=output_disagg,
stderr=subprocess.STDOUT,
env=run_env,
cwd=cwd) as server_proc):
# Wait for server to be ready
if not wait_for_server(server_host,
server_port,
timeout_seconds=server_start_timeout):
raise RuntimeError(
f"Disaggregated server did not become ready within {server_start_timeout} seconds"
)
# Run the cancel stress test
run_cancel_stress_test(server_url,
num_bursts=num_bursts,
requests_per_burst=requests_per_burst)
# Verify server is still healthy after stress test by sending a normal request
client_dir = f"{example_dir}/clients"
client_cmd = [
'python3', f'{client_dir}/disagg_client.py', '-c', config_file,
'-p', f'{client_dir}/prompts.json', '--ignore-eos',
'--server-start-timeout',
str(server_start_timeout)
]
check_call(client_cmd,
env=env,
poll_procs=[workers_proc, server_proc])
except Exception:
logger.error("-------- Workers output --------")
with open('output_workers.log', 'r') as f:
logger.error(f.read())
logger.error("-------- Disagg server output --------")
with open('output_disagg.log', 'r') as f:
logger.error(f.read())
raise
finally:
if 'server_proc' in locals() and 'workers_proc' in locals():
server_proc.terminate()
workers_proc.terminate()
server_proc.wait()
workers_proc.wait()
@pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-bf16'],
indirect=True)
def test_disaggregated_cancel_large_context_requests(disaggregated_test_root,
disaggregated_example_root,
llm_venv,
deepseek_v3_model_root):
"""
Test that the disaggregated server handles request cancellations gracefully.
This test sends bursts of requests with large contexts and cancels them
during prefill to stress test resource cleanup.
"""
src_dst_dict = {
deepseek_v3_model_root:
f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/bf16",
}
for src, dst in src_dst_dict.items():
if not os.path.islink(dst):
os.makedirs(os.path.dirname(dst), exist_ok=True)
os.symlink(src, dst, target_is_directory=True)
run_disaggregated_cancel_test(disaggregated_example_root,
"cancel_stress_test",
env=llm_venv._new_env,
cwd=llm_venv.get_working_directory(),
num_bursts=5,
requests_per_burst=32)
@pytest.mark.skip_less_device(8)
@skip_pre_blackwell
@pytest.mark.parametrize("model_path", ['DeepSeek-V3-0324-FP4'])
def test_disaggregated_cancel_large_context_requests_long(
disaggregated_test_root, disaggregated_example_root, llm_venv,
model_path):
"""Test that disaggregated server handles request cancellations gracefully.
This test sends bursts of requests with large contexts and cancels them
during prefill to stress test resource cleanup.
"""
model_dir = f"{llm_models_root()}/{model_path}"
src_dst_dict = {
model_dir: f"{llm_venv.get_working_directory()}/{model_path}",
}
for src, dst in src_dst_dict.items():
if not os.path.islink(dst):
os.makedirs(os.path.dirname(dst), exist_ok=True)
os.symlink(src, dst, target_is_directory=True)
run_disaggregated_cancel_test(disaggregated_example_root,
"cancel_stress_test_large",
env=llm_venv._new_env,
cwd=llm_venv.get_working_directory(),
num_bursts=1000,
requests_per_burst=32)
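For a quick local run outside pytest, the helper defined above can be pointed at an already-running disaggregated server (the URL and burst sizes here are illustrative):

run_cancel_stress_test("http://localhost:8000",
                       num_bursts=2,
                       requests_per_burst=8,
                       prompt_len_range=(2000, 4000),
                       cancel_after_range=(0.01, 0.05))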

View File

@ -3,15 +3,7 @@ network_name,perf_case_name,test_name,threshold,absolute_threshold,metric_type,p
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,76.45,
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,9785.75,
"llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,55.64,
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,171845.02,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.17,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,48.09,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,6155.59,H100_PCIe
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,139897.82,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,69.59,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,58.63,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100_NVL-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100_NVL-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,7504.07,H100_NVL
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_inference_time[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,5000,INFERENCE_TIME,125068.76,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_kv_cache_size[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",0.20,2,KV_CACHE_SIZE,57.09,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_seq_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,5,SEQ_THROUGHPUT,65.50,H100
"llama_v3.1_8b_instruct-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192","H100-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]","test_perf_metric_token_throughput[llama_v3.1_8b_instruct-subtype:H100-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8192]",-0.20,500,TOKEN_THROUGHPUT,8384.00,H100
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_inference_time[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",0.1,50,INFERENCE_TIME,1359184.5059,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_kv_cache_size[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,50,KV_CACHE_SIZE,10.92,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_seq_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,SEQ_THROUGHPUT,0.3767,H100_PCIe
"deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024","H100_PCIe-PyTorch-Perf-1/perf/test_perf.py::test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]","test_perf_metric_token_throughput[deepseek_r1_distill_qwen_32b-subtype:H100_PCIe-bench-_autodeploy-float16-maxbs:512-maxnt:2048-input_output_len:1024,1024]",-0.1,10,TOKEN_THROUGHPUT,385.7372,H100_PCIe


View File

@ -0,0 +1,108 @@
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
# bash cleanup_jobs.sh
#
# Environment variables:
# OUTPUT_PATH: Directory containing jobs.txt and pytest.pid
set -e
OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"
echo "=========================================="
echo "SLURM Job Cleanup Script"
echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""
# Show pytest PID if available (for debugging)
if [ -f "$PID_FILE" ]; then
PYTEST_PID=$(cat "$PID_FILE" | tr -d '\n')
echo "Pytest PID: $PYTEST_PID"
# Check if pytest is still running
if kill -0 "$PYTEST_PID" 2>/dev/null; then
echo "Status: Still running"
else
echo "Status: Already terminated"
fi
echo ""
else
echo "No pytest.pid found (test may not have started)"
echo ""
fi
# Check if jobs.txt exists
if [ ! -f "$JOBS_FILE" ]; then
echo "[WARN] No jobs.txt found"
echo " Nothing to cancel"
echo "=========================================="
exit 0
fi
echo "[INFO] Reading jobs from: $JOBS_FILE"
# Read, deduplicate, and filter empty lines
JOBS=$(sort -u "$JOBS_FILE" | grep -v '^$' || true)
if [ -z "$JOBS" ]; then
echo "[WARN] jobs.txt is empty"
echo " Nothing to cancel"
echo "=========================================="
exit 0
fi
JOB_COUNT=$(echo "$JOBS" | wc -l)
echo "Found $JOB_COUNT job(s) to cancel"
echo ""
# Cancel each job
CANCELLED=0
ALREADY_DONE=0
FAILED=0
echo "Cancelling jobs..."
while IFS= read -r job_id; do
if [ -n "$job_id" ]; then
printf " %-12s ... " "$job_id"
# Try to cancel the job
if scancel "$job_id" 2>/dev/null; then
echo "[OK] Cancelled"
CANCELLED=$((CANCELLED + 1))
else
# Check if job exists in squeue
if squeue -j "$job_id" -h 2>/dev/null | grep -q "$job_id"; then
echo "[FAIL] Failed to cancel"
FAILED=$((FAILED + 1))
else
echo "[SKIP] Already finished"
ALREADY_DONE=$((ALREADY_DONE + 1))
fi
fi
fi
done <<< "$JOBS"
echo ""
echo "=========================================="
echo "[DONE] Cleanup completed"
echo " Total: $JOB_COUNT"
echo " Cancelled: $CANCELLED"
echo " Already done: $ALREADY_DONE"
echo " Failed: $FAILED"
echo "=========================================="
# Exit with error if any cancellation actually failed
if [ $FAILED -gt 0 ]; then
exit 1
fi
exit 0
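
For reference, the cancel-or-skip loop above can be read as a short Python sketch (hypothetical, not part of this change), assuming scancel and squeue are on PATH and OUTPUT_PATH points at the directory holding jobs.txt:

#!/usr/bin/env python3
# Hypothetical Python equivalent of cleanup_jobs.sh, for illustration only.
import os
import subprocess

output_path = os.environ.get("OUTPUT_PATH", "/tmp")
jobs_file = os.path.join(output_path, "jobs.txt")

if not os.path.isfile(jobs_file):
    print("[WARN] No jobs.txt found, nothing to cancel")
    raise SystemExit(0)

# Deduplicate job IDs and drop empty lines, mirroring `sort -u | grep -v '^$'`.
with open(jobs_file) as f:
    job_ids = sorted({line.strip() for line in f if line.strip()})

failed = 0
for job_id in job_ids:
    if subprocess.run(["scancel", job_id]).returncode == 0:
        print(f"[OK] Cancelled {job_id}")
    elif subprocess.run(["squeue", "-j", job_id, "-h"],
                        capture_output=True, text=True).stdout.strip():
        print(f"[FAIL] Failed to cancel {job_id}")
        failed += 1
    else:
        print(f"[SKIP] {job_id} already finished")

raise SystemExit(1 if failed else 0)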

View File

@ -151,6 +151,7 @@ class BatchManager:
self.submitted_batches = set() # Track which batch numbers have been submitted
self.job_mapping = {} # Map test_id -> SLURM job_id
self.submit_errors = {} # Map test_id -> error message (validation/submission failures)
self.all_configs = [] # Ordered list of all test configs
logger.info(f"\n{'=' * 70}")
@ -214,6 +215,8 @@ class BatchManager:
batch_num: Batch number to submit (0-indexed)
"""
from execution.executor import JobManager
from utils.config_validator import ConfigValidator
from utils.job_tracker import JobTracker
# Calculate batch range
if self.batch_size:
@ -230,33 +233,56 @@ class BatchManager:
logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
logger.info(f"{'=' * 70}\n")
# Submit all jobs in this batch
# Pre-validate all configs before submission
logger.info("Pre-validating configurations...")
valid_configs = []
for config in batch_configs:
try:
ConfigValidator.validate_test_config(config)
valid_configs.append(config)
except Exception as e:
# Validation failed - mark as None and record error
self.job_mapping[config.test_id] = None
self.submit_errors[config.test_id] = f"Validation failed: {str(e)}"
logger.error(f" [FAILED] Validation failed: {config.test_id}")
logger.error(f" Error: {str(e)[:100]}")
logger.info(
f"Validation complete: {len(valid_configs)}/{len(batch_configs)} configs valid\n"
)
# Submit only valid configs
success_count = 0
for i, config in enumerate(batch_configs, 1):
for i, config in enumerate(valid_configs, 1):
try:
success, job_id = JobManager.submit_test_job(config)
if success and job_id:
self.job_mapping[config.test_id] = job_id
JobTracker.record_job(job_id) # Record job ID for cleanup
success_count += 1
# Truncate test_id for display
display_id = (
config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
logger.success(
f" [{i:3d}/{len(valid_configs)}] Job {job_id} <- {config.test_id}"
)
logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
else:
# Submission failed - mark as None and record error
self.job_mapping[config.test_id] = None
logger.error(f" [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
self.submit_errors[config.test_id] = f"Job submission failed: {job_id}"
logger.error(f" [{i:3d}/{len(valid_configs)}] Failed: {config.test_id}")
except Exception as e:
# Submission exception - mark as None and record error
self.job_mapping[config.test_id] = None
logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}")
self.submit_errors[config.test_id] = f"Submission exception: {str(e)}"
logger.error(f" [{i:3d}/{len(valid_configs)}] Error: {e}")
# Mark batch as submitted
self.submitted_batches.add(batch_num)
logger.info(f"\n{'=' * 70}")
logger.success(
f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
f"Batch {batch_num} Complete: {success_count}/{len(valid_configs)} submitted successfully"
)
if len(valid_configs) < len(batch_configs):
logger.warning(f"Skipped {len(batch_configs) - len(valid_configs)} invalid config(s)")
logger.info(f"{'=' * 70}\n")

View File

@ -271,7 +271,7 @@ class JobManager:
@staticmethod
def backup_logs(
job_id: str,
job_id: Optional[str],
test_config,
result_dir: str,
is_passed: bool,
@ -279,13 +279,18 @@ class JobManager:
"""Backup logs and config files to test_id directory.
Args:
job_id: SLURM job ID
job_id: SLURM job ID (None if submission failed)
test_config: TestConfig object
result_dir: Result directory path (already named as test_id)
is_passed: Whether the job passed
Returns:
Final directory path if successful, None otherwise
"""
if job_id is None:
logger.warning(f"Job submission failed for {test_config.test_id}")
else:
logger.info(f"Backing up logs for job {job_id} ({test_config.test_id})")
if not os.path.exists(result_dir):
logger.warning(f"Result directory does not exist yet: {result_dir}")
return None

View File

@ -92,6 +92,13 @@ class HypothesisTestingParams:
# Dataset default parameters for hypothesis testing
# Extracted from accuracy_core.py AccuracyTask subclasses
DATASET_DEFAULTS = {
"aime25": {
"alpha": 0.05,
"beta": 0.2,
"sigma": 50,
"num_samples": 30, # AIME 2025 full sample size
"higher_is_better": True,
},
"gsm8k": {
"alpha": 0.05,
"beta": 0.2,
@ -127,6 +134,14 @@ DATASET_DEFAULTS = {
"num_samples": 198,
"higher_is_better": True,
},
# Alias for gpqa_diamond (same task, different naming convention)
"gpqa_diamond_cot_zeroshot": {
"alpha": 0.05,
"beta": 0.2,
"sigma": 50,
"num_samples": 198,
"higher_is_better": True,
},
"json_mode_eval": {
"alpha": 0.05,
"beta": 0.2,

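For orientation, the alpha/beta/sigma/num_samples defaults above are inputs to a standard power calculation; a rough sketch of the implied minimum detectable accuracy difference under the usual normal approximation (the exact formula lives in accuracy_core.py and may differ) looks like:

# Hypothetical illustration only: minimum detectable difference (theta) from a
# DATASET_DEFAULTS entry, assuming a normal-approximation power analysis.
from scipy.stats import norm

def min_detectable_diff(alpha: float, beta: float, sigma: float, num_samples: int) -> float:
    # z quantiles for the allowed false-positive (alpha) and false-negative (beta) rates;
    # sigma is the per-sample score spread.
    z_alpha = norm.ppf(1 - alpha)
    z_beta = norm.ppf(1 - beta)
    return (z_alpha + z_beta) * sigma / num_samples ** 0.5

# gpqa_diamond and its gpqa_diamond_cot_zeroshot alias share the same defaults,
# so they also share the same detectable-effect size.
print(min_detectable_diff(alpha=0.05, beta=0.2, sigma=50, num_samples=198))
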
View File

@ -22,44 +22,18 @@ cd "$WORK_DIR"
python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
echo "System information collection completed"
# Step 2: Handle different installation modes
echo ""
echo "Step 2: Installing TensorRT-LLM..."
# Step 2: Collect TensorRT-LLM version information (only for none mode)
if [ "$INSTALL_MODE" = "none" ]; then
echo "Using built-in TensorRT-LLM, skipping installation"
elif [ "$INSTALL_MODE" = "wheel" ]; then
echo "Installing TensorRT-LLM wheel..."
echo "Wheel path pattern: $WHEEL_PATH"
# Expand wildcard and install
for wheel_file in $WHEEL_PATH; do
if [ -f "$wheel_file" ]; then
echo "Found wheel: $wheel_file"
pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
break
fi
done
echo "Wheel installation completed"
elif [ "$INSTALL_MODE" = "source" ]; then
echo "Installing TensorRT-LLM from source..."
cd "$REPO_DIR"
pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
echo "Source installation completed"
echo ""
echo "Step 2: Collecting TensorRT-LLM version information..."
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
echo "TensorRT-LLM version written to: $VERSION_FILE"
else
echo "ERROR: Invalid install mode: $INSTALL_MODE"
exit 1
echo ""
echo "Step 2: Skipping TensorRT-LLM version collection (install_mode=$INSTALL_MODE)"
fi
# Step 3: Collect TensorRT-LLM version information
echo ""
echo "Step 3: Collecting TensorRT-LLM version information..."
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
echo "TensorRT-LLM version written to: $VERSION_FILE"
echo ""
echo "=========================================="
echo "Session Collect Job Completed"

View File

@ -77,12 +77,12 @@ worker_config:
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 8
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
enable_attention_dp: false
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null

View File

@ -22,7 +22,7 @@ benchmark:
multi_round: 8
benchmark_ratio: 0.8
streaming: true
concurrency_list: '6144'
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: <dataset_file>

View File

@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass
@ -89,7 +92,7 @@ worker_config:
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXLf
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:

View File

@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

View File

@ -82,6 +82,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

View File

@ -82,6 +82,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

View File

@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

View File

@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

View File

@ -47,6 +47,11 @@ else:
@pytest.fixture(scope="session", autouse=True)
def session_lifecycle():
"""Session lifecycle management."""
from utils.job_tracker import JobTracker
# Record pytest main process PID for GitLab CI cleanup
JobTracker.record_pid()
session_tracker.start()
try:
yield
@ -66,11 +71,8 @@ class TestDisaggBenchmark:
"""Performance benchmark test for YAML configurations."""
full_test_name = request.node.name
# Validate configuration first (before any other operations)
try:
ConfigValidator.validate_test_config(test_config)
except Exception as e:
pytest.fail(f"Configuration validation failed: {e}")
# Note: Configuration validation is done during batch submission (in conftest.py)
# If validation failed, job_id will be None and the assert below will fail
# Create test case tracker
test_tracker = TestCaseTracker()
@ -104,8 +106,11 @@ class TestDisaggBenchmark:
# Get job_id from batch manager (auto-submits batch if needed)
job_id = batch_manager.get_job_id(test_config)
# Validate submission result
assert job_id, f"Failed to get job_id for {test_config.test_id}"
# Validate submission result (will be None if validation/submission failed)
error_msg = batch_manager.submit_errors.get(
test_config.test_id, "Check batch submission logs for details"
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@ -125,13 +130,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
@pytest.mark.accuracy
@pytest.mark.parametrize("test_config", ACCURACY_TEST_CASES)
@ -204,13 +208,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
@pytest.mark.stress
@pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
@ -222,11 +225,8 @@ class TestDisaggBenchmark:
"""
full_test_name = request.node.name
# Validate configuration first (before any other operations)
try:
ConfigValidator.validate_test_config(test_config)
except Exception as e:
pytest.fail(f"Configuration validation failed: {e}")
# Note: Configuration validation is done during batch submission (in conftest.py)
# If validation failed, job_id will be None and the assert below will fail
# Create test case tracker
test_tracker = TestCaseTracker()
@ -266,8 +266,11 @@ class TestDisaggBenchmark:
# Get job_id from batch manager (auto-submits batch if needed)
job_id = batch_manager.get_job_id(test_config)
# Validate submission result
assert job_id, f"Failed to get job_id for {test_config.test_id}"
# Validate submission result (will be None if validation/submission failed)
error_msg = batch_manager.submit_errors.get(
test_config.test_id, "Check batch submission logs for details"
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@ -287,13 +290,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
if __name__ == "__main__":

View File

@ -0,0 +1,61 @@
"""Simple job and process tracker for GitLab CI cleanup."""
import os
from utils.common import EnvManager
from utils.logger import logger
class JobTracker:
"""Track SLURM job IDs and pytest PID for GitLab CI cleanup."""
@staticmethod
def get_jobs_file() -> str:
"""Get jobs.txt file path in output_path."""
output_path = EnvManager.get_output_path()
return os.path.join(output_path, "jobs.txt")
@staticmethod
def get_pid_file() -> str:
"""Get pytest.pid file path in output_path."""
output_path = EnvManager.get_output_path()
return os.path.join(output_path, "pytest.pid")
@staticmethod
def record_pid():
"""Record pytest main process PID to pytest.pid file."""
pid = os.getpid()
pid_file = JobTracker.get_pid_file()
try:
# Ensure output directory exists
os.makedirs(os.path.dirname(pid_file), exist_ok=True)
# Write PID
with open(pid_file, "w") as f:
f.write(f"{pid}\n")
f.flush()
logger.info(f"Recorded pytest PID: {pid} -> {pid_file}")
except Exception as e:
logger.warning(f"Failed to record PID: {e}")
@staticmethod
def record_job(job_id: str):
"""Append SLURM job ID to jobs.txt file.
Args:
job_id: SLURM job ID to record
"""
jobs_file = JobTracker.get_jobs_file()
try:
# Ensure output directory exists
os.makedirs(os.path.dirname(jobs_file), exist_ok=True)
# Append job ID
with open(jobs_file, "a") as f:
f.write(f"{job_id}\n")
f.flush()
logger.debug(f"Recorded SLURM job: {job_id}")
except Exception as e:
logger.warning(f"Failed to record job ID {job_id}: {e}")

View File

@ -79,6 +79,8 @@ class SessionTracker:
Uses the new sbatch-based approach for non-blocking execution.
Submits the job and waits for completion using JobManager.
"""
from utils.job_tracker import JobTracker
self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"Session ended: {self.end_time}")
@ -89,6 +91,9 @@ class SessionTracker:
logger.error(f"Failed to submit session collect job: {job_id}")
return False
# Record session collect job ID for cleanup
JobTracker.record_job(job_id)
# Wait for job completion (reuses wait_for_completion method)
logger.info(f"Waiting for session collect job {job_id} to complete...")
JobManager.wait_for_completion(

View File

@ -22,7 +22,7 @@ import sys
import time
from datetime import datetime
from defs.trt_test_alternative import print_error, print_info, print_warning
from defs.trt_test_alternative import print_info, print_warning
_project_root = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../../../..'))
@ -78,6 +78,7 @@ PRE_MERGE_THRESHOLD = 0.1
# scenario, allowing the underlying config to change while still comparing against baselines
# for the same scenario.
SCENARIO_MATCH_FIELDS = [
"s_gpu_type",
"s_runtime",
"s_model_name",
"l_isl",
@ -282,28 +283,27 @@ def query_history_data(common_values_dict):
f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned no response"
)
return None
else:
payload = res.json().get("hits", {}).get("hits", [])
if len(payload) == 0:
# No history data found in database, return empty list
print_info(
f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
)
return []
for hit in payload:
data_dict = hit.get("_source", {})
data_dict["_id"] = hit.get("_id", "")
if data_dict["_id"] == "":
print_info(
f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
)
# Invalid data, return None
return None
data_list.append(data_dict)
payload = res.json().get("hits", {}).get("hits", [])
if len(payload) == 0:
# No history data found in database, return empty list
print_info(
f"Successfully queried from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
)
return data_list
return []
for hit in payload:
data_dict = hit.get("_source", {})
data_dict["_id"] = hit.get("_id", "")
if data_dict["_id"] == "":
print_info(
f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
)
# Invalid data, return None
return None
data_list.append(data_dict)
print_info(
f"Successfully queried from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
)
return data_list
except Exception as e:
print_info(
f"Failed to query from {TEST_INFO_PROJECT_NAME}, returned error: {e}"
@ -522,7 +522,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
# Add metric info to s_regression_info
metric_info = (f"{metric}'s value: {new_value} "
f"baseline value: {baseline_value} "
f"threshold: {threshold} "
f"threshold: {threshold * 100:.2f}% "
f"diff: {diff:+.2f}%")
info_parts.append(metric_info)
@ -643,65 +643,19 @@ def _get_metric_keys():
return metric_keys
def _print_perf_data(data):
"""Print performance metrics and config for a single data entry."""
print_info("=== Metrics ===")
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
if metric in data:
value = data.get(metric, "N/A")
print_info(f'"{metric}": {value}')
metric_keys = _get_metric_keys()
print_info("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
value = data[key]
print_info(f'"{key}": {value}')
def _print_regression_data(data, print_func=None):
"""
Print regression info, metrics with baselines/thresholds, and config.
Print regression info and config.
"""
if print_func is None:
print_func = print_info
if "s_regression_info" in data:
print_func("=== Regression Info ===")
print_func(f"{data['s_regression_info']}")
for item in data["s_regression_info"].split(","):
print_func(item.strip())
metric_keys = _get_metric_keys()
is_post_merge = data.get("b_is_post_merge", False)
print_func("=== Metrics ===")
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:] # Strip "d_" prefix
baseline_key = f"d_baseline_{metric_suffix}"
if is_post_merge:
threshold_key = f"d_threshold_post_merge_{metric_suffix}"
else:
threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
# Only print if at least one of the keys exists
if metric in data or baseline_key in data or threshold_key in data:
value = data.get(metric, "N/A")
baseline = data.get(baseline_key, "N/A")
threshold = data.get(threshold_key, "N/A")
# Calculate percentage difference between value and baseline
# Positive percentage means better perf, negative means regression
if (isinstance(value, (int, float))
and isinstance(baseline, (int, float)) and baseline != 0):
if metric in MAXIMIZE_METRICS:
# Larger is better: value > baseline is positive (better)
percentage = (value - baseline) / baseline * 100
else:
# Smaller is better: value < baseline is positive (better)
percentage = (baseline - value) / baseline * 100
percentage_str = f"{percentage:+.2f}%"
else:
percentage_str = "N/A"
print_func(
f'"{metric}": {value}, "{baseline_key}": {baseline}, '
f'"{threshold_key}": {threshold}, "diff": {percentage_str}')
print_func("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
@ -712,16 +666,17 @@ def _print_regression_data(data, print_func=None):
print_func(f'"{key}": {value}')
def check_perf_regression(new_data_dict):
def check_perf_regression(new_data_dict, fail_on_regression=False):
"""
Check performance regression by printing regression data from new_data_dict.
If fail_on_regression is True, raises RuntimeError when regressions are found.
(Temporary behavior: for now regressions only raise when this flag is set while we monitor stability; failing by default will be enabled later.)
"""
# Filter regression data from new_data_dict
regressive_data_list = [
data for data in new_data_dict.values()
if data.get("b_is_regression", False)
]
# Split regression data into post-merge and pre-merge
post_merge_regressions = [
data for data in regressive_data_list
@ -735,24 +690,34 @@ def check_perf_regression(new_data_dict):
# Print pre-merge regression data with print_warning
if len(pre_merge_regressions) > 0:
print_warning(
f"Found {len(pre_merge_regressions)} pre-merge regression data")
f"Found {len(pre_merge_regressions)} pre-merge perf regression data"
)
for i, data in enumerate(pre_merge_regressions):
print_warning(f"\n{'=' * 60}")
print_warning(f"Pre-merge Regression Data #{i + 1}")
print_warning("=" * 60)
_print_regression_data(data, print_func=print_warning)
# Print post-merge regression data with print_error
if fail_on_regression:
raise RuntimeError(
f"Found {len(pre_merge_regressions)} pre-merge perf regression data"
)
# Print post-merge regression data with print_warning
if len(post_merge_regressions) > 0:
print_warning(
f"Found {len(post_merge_regressions)} post-merge perf regression data"
)
for i, data in enumerate(post_merge_regressions):
print_error(f"\n{'=' * 60}")
print_error(f"Post-merge Regression Data #{i + 1}")
print_error("=" * 60)
_print_regression_data(data, print_func=print_error)
print_error(
f"Found {len(post_merge_regressions)} post-merge regression data")
raise RuntimeError(
f"Found {len(post_merge_regressions)} post-merge regression data")
print_warning(f"\n{'=' * 60}")
print_warning(f"Post-merge Regression Data #{i + 1}")
print_warning("=" * 60)
_print_regression_data(data, print_func=print_warning)
if fail_on_regression:
raise RuntimeError(
f"Found {len(post_merge_regressions)} post-merge perf regression data"
)
# Print summary if no regressions
if len(regressive_data_list) == 0:

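For orientation, the pre-/post-merge decision above reduces to comparing a metric's relative change against its per-metric threshold; a hedged sketch (the exact comparison lives in prepare_regressive_test_cases and its sign conventions may differ):

# Hypothetical sketch: a metric regresses when its relative change versus the
# baseline is worse than the threshold in the "bad" direction.
def is_regression(new_value: float, baseline: float, threshold: float,
                  higher_is_better: bool) -> bool:
    if baseline == 0:
        return False  # no meaningful relative comparison
    diff = (new_value - baseline) / baseline  # e.g. -0.12 == 12% drop
    if higher_is_better:
        return diff < -threshold  # throughput-style (MAXIMIZE) metrics
    return diff > threshold       # latency-style (MINIMIZE) metrics

# With PRE_MERGE_THRESHOLD = 0.1, a 12% throughput drop would be flagged:
print(is_regression(new_value=88.0, baseline=100.0, threshold=0.1, higher_is_better=True))
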
View File

@ -23,7 +23,7 @@ import re
import socket
import subprocess
import time
from typing import Dict, List, NamedTuple, Tuple
from typing import Dict, List, NamedTuple, Optional, Tuple
import pytest
import requests
@ -58,6 +58,7 @@ MODEL_PATH_DICT = {
}
SUPPORTED_GPU_TYPE = [
"H200",
"B200",
"B300",
"GB200",
@ -124,6 +125,7 @@ class ServerConfig:
self.model_name = server_config_data["model_name"]
self.model_path = ""
self.env_vars = env_vars
self.disagg_run_type = server_config_data.get("disagg_run_type", "aggr")
# Extract optional fields with defaults
self.tp = server_config_data.get("tensor_parallel_size", 1)
@ -220,9 +222,12 @@ class ServerConfig:
"concurrency",
"name",
"model_name",
"disagg_run_type",
"gpus",
"gpus_per_node",
"match_mode",
"client_configs",
"match_mode",
]
self.extra_llm_api_config_data = {
k: v for k, v in server_config_data.items() if k not in exclude_keys
@ -234,7 +239,7 @@ class ServerConfig:
"""Generate server command."""
model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
config_filename = f"extra-llm-api-config.{self.name}.yml"
config_filename = f"extra-llm-api-config.{self.disagg_run_type}.{self.name}.yml"
config_path = os.path.join(output_dir, config_filename)
numa_bind_cmd = []
@ -517,7 +522,9 @@ class AggrTestCmds(NamedTuple):
)
wait_for_endpoint_ready(
f"http://{server_hostname}:{server_port}/health", timeout=self.timeout
f"http://{server_hostname}:{server_port}/health",
timeout=self.timeout,
server_proc=server_proc,
)
# Run all clients for this server
@ -667,10 +674,13 @@ class DisaggTestCmds(NamedTuple):
break
time.sleep(10)
def wait_for_endpoint_ready(self, url: str):
def wait_for_endpoint_ready(self, url: str, server_files: Optional[List[str]] = None):
"""Wait for endpoint to be ready."""
start = time.monotonic()
iteration = 0
error_keywords = ["RuntimeError", "out of memory", "ValueError"]
while True:
iteration += 1
elapsed_time = time.monotonic() - start
if elapsed_time > self.timeout:
print_error(
@ -678,6 +688,22 @@ class DisaggTestCmds(NamedTuple):
)
break
print_info(f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s")
if server_files and iteration % 30 == 0:
for server_file in server_files:
if os.path.exists(server_file):
try:
with open(server_file, "r") as f:
content = f.read()
for line in content.splitlines():
for keyword in error_keywords:
if keyword in line:
print_error(
f"Found '{keyword}' in server file {server_file}: {line}"
)
except Exception as e:
print_info(f"Failed to read server file {server_file}: {e}")
try:
time.sleep(10)
if requests.get(url).status_code == 200:
@ -693,7 +719,6 @@ class DisaggTestCmds(NamedTuple):
port = get_free_port()
ctx_cmd, gen_cmd, disagg_cmd = self.server_cmds[server_idx]
if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type:
self._generate_hostname_file(server_idx, port)
server_file_path = os.path.join(
@ -702,7 +727,6 @@ class DisaggTestCmds(NamedTuple):
is_ctx = "CTX" in self.disagg_serving_type
server_cmd = ctx_cmd if is_ctx else gen_cmd
server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port)
try:
print_info(
f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}"
@ -724,7 +748,6 @@ class DisaggTestCmds(NamedTuple):
disagg_server_file_path = os.path.join(
self.output_dir, f"trtllm-serve.{server_idx}.{self.disagg_serving_type}.log"
)
try:
self._generate_disagg_server_config(server_idx, port)
print_info(f"Starting disagg server. cmd is {disagg_cmd}")
@ -746,8 +769,24 @@ class DisaggTestCmds(NamedTuple):
disagg_server_hostname, disagg_server_port = (
self._get_disagg_server_hostname_and_port(server_idx)
)
server_files = [
os.path.join(self.output_dir, f"trtllm-serve.{server_idx}.DISAGG_SERVER.log"),
]
for ctx_idx in range(self.num_ctx_servers):
server_files.append(
os.path.join(
self.output_dir, f"trtllm-serve.{server_idx}.CTX_{ctx_idx}.log"
)
)
for gen_idx in range(self.num_gen_servers):
server_files.append(
os.path.join(
self.output_dir, f"trtllm-serve.{server_idx}.GEN_{gen_idx}.log"
)
)
self.wait_for_endpoint_ready(
f"http://{disagg_server_hostname}:{disagg_server_port}/health"
f"http://{disagg_server_hostname}:{disagg_server_port}/health",
server_files=server_files,
)
# Run all clients for this server
@ -799,7 +838,6 @@ class PerfSanityTestConfig:
def __init__(self, test_case_name: str, output_dir: str):
self._output_dir = output_dir
self._test_results: Dict[int, Dict[str, float]] = {}
self._perf_results: Dict[int, List[Dict[str, float]]] = {}
# Parse test case name
@ -977,6 +1015,7 @@ class PerfSanityTestConfig:
"name": config_file_base_name,
"model_name": model_name,
"gpus_per_node": gpus_per_node,
"disagg_run_type": "ctx",
**worker_config.get("ctx", {}),
}
@ -986,6 +1025,7 @@ class PerfSanityTestConfig:
"name": config_file_base_name,
"model_name": model_name,
"gpus_per_node": gpus_per_node,
"disagg_run_type": "gen",
**worker_config.get("gen", {}),
}
@ -1047,7 +1087,7 @@ class PerfSanityTestConfig:
# Generate extra-llm-api-config.yml
config_content = server_config.generate_extra_llm_api_config()
config_filename = f"extra-llm-api-config.{server_config.name}.yml"
config_filename = f"extra-llm-api-config.aggr.{server_config.name}.yml"
config_path = os.path.join(output_dir, config_filename)
with open(config_path, "w") as f:
f.write(config_content)
@ -1080,7 +1120,9 @@ class PerfSanityTestConfig:
ctx_cmd = ctx_config.to_cmd(output_dir, numa_bind, "CTX")
if "CTX" in disagg_serving_type:
config_content = ctx_config.generate_extra_llm_api_config()
config_path = os.path.join(output_dir, "extra-llm-api-config.ctx.yml")
config_path = os.path.join(
output_dir, f"extra-llm-api-config.ctx.{ctx_config.name}.yml"
)
with open(config_path, "w") as f:
f.write(config_content)
@ -1088,7 +1130,9 @@ class PerfSanityTestConfig:
gen_cmd = gen_config.to_cmd(output_dir, numa_bind, "GEN")
if "GEN" in disagg_serving_type:
config_content = gen_config.generate_extra_llm_api_config()
config_path = os.path.join(output_dir, "extra-llm-api-config.gen.yml")
config_path = os.path.join(
output_dir, f"extra-llm-api-config.gen.{gen_config.name}.yml"
)
with open(config_path, "w") as f:
f.write(config_content)
@ -1165,44 +1209,59 @@ class PerfSanityTestConfig:
if failed_requests_match:
failed_count = int(failed_requests_match.group(1))
if failed_count > 0:
print_error(f"Benchmark output contains {failed_count} failed requests.")
raise Exception(f"Benchmark has {failed_count} failed requests")
error_msg = f"Benchmark output contains {failed_count} failed requests."
raise Exception(error_msg)
# Check for explicit failure markers
if "!FAILED REQUESTS!" in output or "!CHECK LOG FOR ERRORS!" in output:
print_error("Benchmark output contains failure markers.")
raise Exception("Benchmark output contains failure markers")
error_msg = "Benchmark output contains failure markers."
raise Exception(error_msg)
def get_perf_result(self, outputs: Dict[int, List[str]]):
"""Parse performance results from outputs."""
self._perf_results = {}
for server_idx, server_outputs in outputs.items():
self._perf_results[server_idx] = []
for output in server_outputs:
metrics = {}
def parse_metrics_from_output(output: str) -> Optional[Dict[str, float]]:
"""Parse all metrics from a single output string."""
metrics = {}
for line in output.split("\n"):
for metric_type, regex in PERF_METRIC_LOG_QUERIES.items():
regex_matches = [regex.search(line) for line in output.split("\n")]
for match in regex_matches:
if match:
value = None
for i in range(1, len(match.groups()) + 1):
if match.group(i) is not None:
value = match.group(i)
break
if value is not None:
metrics[metric_type] = float(value)
break
if metric_type in metrics:
continue
match = regex.search(line)
if match:
metrics[metric_type] = float(match.group(1))
break
return metrics
self._perf_results = {}
for server_idx, client_configs in self.server_client_configs.items():
self._perf_results[server_idx] = []
server_outputs = outputs.get(server_idx, [])
for output in server_outputs:
metrics = parse_metrics_from_output(output)
self._perf_results[server_idx].append(metrics)
# Also populate _test_results for upload (flattened view)
cmd_idx = 0
for server_idx in sorted(self._perf_results.keys()):
for client_metrics in self._perf_results[server_idx]:
self._test_results[cmd_idx] = client_metrics
cmd_idx += 1
def check_test_failure(self):
"""Check if any server failed based on perf results."""
error_msg = ""
for server_idx, client_configs in self.server_client_configs.items():
server_perf_results = self._perf_results.get(server_idx, [])
if len(server_perf_results) != len(client_configs):
error_msg += (
f"Server {server_idx}'s perf results number: {len(server_perf_results)} "
f"is not equal to client number: {len(client_configs)}. "
)
for client_idx, metrics in enumerate(server_perf_results):
if len(metrics) != len(PERF_METRIC_LOG_QUERIES):
error_msg += (
f"Some metrics in Server {server_idx} Client {client_idx} are missing. "
f"The broken metrics is {metrics}. "
)
if error_msg:
raise Exception(error_msg)
print_info("All servers passed")
def upload_test_results_to_database(self):
"""Upload test results and baseline to database."""
@ -1219,25 +1278,27 @@ class PerfSanityTestConfig:
return {add_prefix(key, prefix_name): value for key, value in config_dict.items()}
match_keys = []
is_scenario_mode = False
if self.runtime == "aggr_server":
job_config = get_job_info()
is_post_merge = job_config["b_is_post_merge"]
new_data_dict = {}
cmd_idx = 0
for server_idx, client_configs in self.server_client_configs.items():
server_config = self.server_configs[server_idx]
server_config_dict = server_config.to_db_data()
server_perf_results = self._perf_results.get(server_idx, [])
# Skip if server failed
if len(server_perf_results) != len(client_configs):
cmd_idx += len(client_configs)
continue
for client_config in client_configs:
for client_idx, client_config in enumerate(client_configs):
client_config_dict = client_config.to_db_data()
# Skip if metrics missing
if cmd_idx not in self._test_results or not all(
metric_name in self._test_results[cmd_idx]
for metric_name in PERF_METRIC_LOG_QUERIES
):
if server_perf_results[client_idx] is None:
print_info(
f"Skipped posting command {cmd_idx}'s test results since some metrics are missing."
)
@ -1257,18 +1318,18 @@ class PerfSanityTestConfig:
new_data["s_test_case_name"] = f"{server_config.name}-{client_config.name}"
for metric_name in PERF_METRIC_LOG_QUERIES:
if metric_name in self._test_results[cmd_idx]:
new_data[f"d_{metric_name}"] = self._test_results[cmd_idx][metric_name]
new_data[f"d_{metric_name}"] = server_perf_results[client_idx][metric_name]
add_id(new_data)
new_data_dict[cmd_idx] = new_data
cmd_idx += 1
if not match_keys:
match_keys.extend(["s_gpu_type", "s_runtime"])
if server_config.match_mode == "scenario":
match_keys = SCENARIO_MATCH_FIELDS.copy()
is_scenario_mode = True
else:
match_keys.extend(["s_gpu_type", "s_runtime"])
match_keys.extend(server_config.to_match_keys())
match_keys.extend(client_config.to_match_keys())
@ -1285,12 +1346,16 @@ class PerfSanityTestConfig:
for server_idx, (ctx_config, gen_config, disagg_config) in enumerate(
self.server_configs
):
for client_config in self.server_client_configs[server_idx]:
client_configs = self.server_client_configs[server_idx]
server_perf_results = self._perf_results.get(server_idx, [])
# Skip if server failed
if len(server_perf_results) != len(client_configs):
cmd_idx += len(client_configs)
continue
for client_idx, client_config in enumerate(client_configs):
# Skip if metrics missing
if cmd_idx not in self._test_results or not all(
metric_name in self._test_results[cmd_idx]
for metric_name in PERF_METRIC_LOG_QUERIES
):
if server_perf_results[client_idx] is None:
print_info(
f"Skipped posting command {cmd_idx}'s test results since some metrics are missing."
)
@ -1323,8 +1388,7 @@ class PerfSanityTestConfig:
new_data["s_test_case_name"] = f"{disagg_config.name}-{client_config.name}"
for metric_name in PERF_METRIC_LOG_QUERIES:
if metric_name in self._test_results[cmd_idx]:
new_data[f"d_{metric_name}"] = self._test_results[cmd_idx][metric_name]
new_data[f"d_{metric_name}"] = server_perf_results[client_idx][metric_name]
add_id(new_data)
new_data_dict[cmd_idx] = new_data
@ -1376,7 +1440,7 @@ class PerfSanityTestConfig:
# Upload the new perf data and baseline data to database
post_new_perf_data(new_baseline_data_dict, new_data_dict)
check_perf_regression(new_data_dict)
check_perf_regression(new_data_dict, fail_on_regression=is_scenario_mode)
# Perf sanity test case parameters
@ -1479,5 +1543,8 @@ def test_e2e(output_dir, perf_sanity_test_case):
# Parse performance results
config.get_perf_result(outputs)
# Check for test failures
config.check_test_failure()
# Upload results to database
config.upload_test_results_to_database()

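parse_metrics_from_output above keeps the first regex hit per metric while scanning the benchmark output line by line; a standalone sketch with placeholder patterns (the real PERF_METRIC_LOG_QUERIES regexes are not reproduced here) shows the shape of that loop:

import re
from typing import Dict

# Placeholder patterns standing in for PERF_METRIC_LOG_QUERIES; the names and
# regexes are assumptions, not the queries used by test_perf_sanity.py.
PERF_METRIC_LOG_QUERIES = {
    "seq_throughput": re.compile(r"Request throughput.*?:\s+([\d.]+)"),
    "token_throughput": re.compile(r"Output token throughput.*?:\s+([\d.]+)"),
}

def parse_metrics_from_output(output: str) -> Dict[str, float]:
    """First match per metric wins, mirroring the nested helper in get_perf_result."""
    metrics: Dict[str, float] = {}
    for line in output.split("\n"):
        for metric_type, regex in PERF_METRIC_LOG_QUERIES.items():
            if metric_type in metrics:
                continue
            match = regex.search(line)
            if match:
                metrics[metric_type] = float(match.group(1))
                break
    return metrics

sample = "Request throughput (req/s):  48.09\nOutput token throughput (tok/s): 6155.59\n"
print(parse_metrics_from_output(sample))  # both metrics parsed, one value each
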
View File

@ -23,15 +23,15 @@ llm_config_database:
system_gpu_count:
gte: 1
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- condition:
wildcards:
gpu:
@ -42,15 +42,15 @@ llm_config_database:
system_gpu_count:
gte: 2
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- condition:
wildcards:
gpu:
@ -61,21 +61,21 @@ llm_config_database:
system_gpu_count:
gte: 4
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- condition:
wildcards:
gpu:
@ -86,27 +86,27 @@ llm_config_database:
system_gpu_count:
gte: 8
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
- condition:
wildcards:
gpu:
@ -117,15 +117,15 @@ llm_config_database:
system_gpu_count:
gte: 1
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
- condition:
wildcards:
gpu:
@@ -136,15 +136,15 @@ llm_config_database:
system_gpu_count:
gte: 2
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
- condition:
wildcards:
gpu:
@@ -155,15 +155,15 @@ llm_config_database:
system_gpu_count:
gte: 4
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
- condition:
wildcards:
gpu:
@@ -174,18 +174,18 @@ llm_config_database:
system_gpu_count:
gte: 8
tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]

View File

@@ -1,391 +1,4 @@
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaModel-bert/roberta-base]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha_fp32_acc-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2]
examples/test_bindings.py::test_llm_bindings_example[llama-7b]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-disable_weight_only]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only]
examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (180)
examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (180)
examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90)
examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:4] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_deep_2.4b-float16-nb:1] TIMEOUT (90)
examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90)
examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-2-27b-it-other-bfloat16-8]
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8]
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it]
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it]
examples/test_gpt.py::test_llm_gpt2_medium_1gpu[non_streaming-use_py_session-disable_gemm_plugin]
examples/test_gpt.py::test_llm_gpt2_medium_1gpu[streaming-use_cpp_session-enable_gemm_plugin]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp1pp4]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp2pp2]
examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp4pp1]
examples/test_gpt.py::test_llm_gpt2_medium_bad_words_1gpu[non_streaming-use_cpp_session]
examples/test_gpt.py::test_llm_gpt2_medium_stop_words_1gpu[streaming-use_cpp_session]
examples/test_gpt.py::test_llm_gpt2_multi_lora_1gpu[900_stories]
examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1]
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-1]
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct]
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct]
examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_4]
examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_1]
examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_4]
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16]
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16]
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct]
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-2b-instruct]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_8-float16-bs1]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu
examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1]
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2]
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1]
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2]
examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2]
examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa]
examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_with_fp32_acc-enable_gemm_plugin-enable_attention_plugin-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-34b-Instruct-tp4pp1-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-70b-hf-tp2pp2-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1]
examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4]
examples/test_llama.py::test_codellama_fp8_with_bf16_lora[CodeLlama-7b-Instruct]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v2-7b-hf]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b]
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-3b]
examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp8pp1-nb:1]
examples/test_llama.py::test_llm_llama_lookahead_single_gpu_summary[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b]
examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4]
examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b]
examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp16]
examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_awq]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp16]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180)
examples/test_llama.py::test_llm_llama_v3_dora_1gpu[commonsense-llama-v3-8b-dora-r32-llama-v3-8b-hf-base_fp16]
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-1.4b-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-2.8b-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-370m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-790m-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-1.3b-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-2.7b-float16-disable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-370m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-780m-float16-disable_gemm_plugin]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1]
# Multimodal Executor Cpp E2E Tests
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8]
examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen1.5_72b_chat-tp4pp2-context_fmha]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2_72b_instruct-tp8pp1-context_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp4pp2-context_fmha]
examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp8pp1-context_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen1.5_7b_chat-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_7b_instruct-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_0.5b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2] # 5 mins
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen1.5_7b_chat-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2.5_7b_instruct-nb:4]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_14b_chat_int4-nb:4]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_7b_chat_awq-nb:1]
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen1.5_7b_chat-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_vl_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2.5_7b_instruct-enable_ptpc-nb:4]
examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2_0.5b_instruct]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_0.5b_instruct]
examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct]
examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat]
examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-flax-no_paged_cache-disable_quant-float16-enable_attn_plugin-disable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int4-float16-nb:1-use_cpp_runtime]
# Accuracy test list
accuracy/test_cli_flow.py::TestGpt2::test_auto_dtype
accuracy/test_cli_flow.py::TestGpt2::test_gemm_plugin
accuracy/test_cli_flow.py::TestGpt2::test_attention_ootb
accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_disabled
accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_fp32_acc
accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int8]
accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4]
accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache
accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=False-per_channel=False]
accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=True-per_channel=True]
accuracy/test_cli_flow.py::TestGpt2::test_beam_search
accuracy/test_cli_flow.py::TestGpt2::test_beam_search_large
accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin
accuracy/test_cli_flow.py::TestGpt2::test_cuda_graph
accuracy/test_cli_flow.py::TestGpt2Medium::test_auto_dtype
accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8
accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8_lm_head
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype
accuracy/test_cli_flow.py::TestStarcoder2_3B::test_auto_dtype
accuracy/test_cli_flow.py::TestStarcoder2_15B::test_smooth_quant_ootb
accuracy/test_cli_flow.py::TestGptNext::test_auto_dtype
accuracy/test_cli_flow.py::TestMinitron4BBase::test_auto_dtype
accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8
accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi2::test_tp2
accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive
accuracy/test_cli_flow.py::TestMamba130M::test_auto_dtype
accuracy/test_cli_flow.py::TestVicuna7B::test_lookahead
accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=False-chunked_context=False-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False]
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True]
accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama7B::test_beam_search
accuracy/test_cli_flow.py::TestLlama7B::test_int4_gptq
accuracy/test_cli_flow.py::TestLlama7B::test_streamingllm
accuracy/test_cli_flow.py::TestLlama2_7B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[tp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[pp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2]
accuracy/test_cli_flow.py::TestLlama2_7B::test_tp2cp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2
accuracy/test_cli_flow.py::TestLlama2_7B::test_weight_sparsity
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_float32
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int8]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int4]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8]
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_pp4
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-enable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-disable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_medusa_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_manage_weights
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_weight_streaming[1.0]
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant
accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant_ootb
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_pp_reduce_scatter_tp2pp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[expert_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[mixed_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[no_renormalize-tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-expert_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-mixed_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel]
accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized
accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 TIMEOUT (120)
accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] TIMEOUT (90)
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8]
accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4]
accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_auto_dtype
accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_fp8
accuracy/test_cli_flow.py::TestQwen2_1_5B::test_auto_dtype_cp4
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only
accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_0_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
# text generation accuracy test
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
@@ -418,6 +31,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
@@ -428,14 +45,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
@@ -464,8 +73,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
@@ -492,9 +99,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2]
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
@@ -522,13 +126,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -536,40 +139,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
@ -621,12 +202,98 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_m
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus
# multimodal accuracy tests
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
# disaggregated serving accuracy test
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]
@ -653,63 +320,12 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus
accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
# e2e test
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session--]
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-codellama/CodeLlama-7b-Instruct-hf] # 5min
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf] # 5min
test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session---]
test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
@ -720,9 +336,6 @@ test_e2e.py::test_openai_chat_harmony
test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]
test_e2e.py::test_trtllm_multimodal_benchmark_serving
llmapi/test_llm_examples.py::test_llmapi_server_example
# Pivot to Pytorch test cases.
test_e2e.py::test_ptp_quickstart
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
@ -766,47 +379,9 @@ test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-
test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3]
unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora
examples/test_medusa.py::test_codellama_medusa_1gpu[CodeLlama-7b-Instruct]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen_7b_chat]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen1.5_7b_chat]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_7b_instruct]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_0.5b_instruct]
examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2.5_1.5b_instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[phi-2]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-mini-128k-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-small-128k-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3.5-mini-instruct]
examples/test_medusa.py::test_phi_medusa_1gpu[Phi-4-mini-instruct]
examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1]
examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle2]
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2]
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2]
llmapi/test_llm_examples.py::test_llmapi_server_example
# e2e serve test
examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
examples/serve/test_serve.py::test_config_file_loading[--config]
examples/serve/test_serve.py::test_env_overrides_pdl
@ -827,8 +402,7 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
examples/serve/test_serve_negative.py::test_missing_content_type_header
examples/serve/test_serve_negative.py::test_extremely_large_batch
# PyTorch flow disaggregated tests
# e2e disaggregated serving test
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]

View File

@ -1,39 +1,53 @@
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
# text generation accuracy test
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
@ -61,20 +75,28 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto]
@ -122,117 +144,29 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus]
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
# multimodal accuracy tests
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
@ -240,28 +174,41 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin]
# disaggregated serving accuracy test
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
# e2e test
test_e2e.py::test_openai_chat_harmony
test_e2e.py::test_openai_consistent_chat
test_e2e.py::test_openai_multi_chat_example
@ -291,3 +238,27 @@ test_e2e.py::test_trtllm_multimodal_benchmark_serving
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf-Qwen3/qwen3-235B-eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3]
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3]
# e2e disaggregated serving test
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]
disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin]

View File

@ -179,7 +179,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]

View File

@ -9,7 +9,6 @@ llm_perf_sanity:
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================
# 1: All GPUs
@ -31,6 +30,7 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
- perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:512,512]
# Phi-4-multimodal-instruct
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
# Bielik-11B-v2.2-Instruct
@ -124,25 +124,9 @@ llm_perf_sanity:
# for chunked prefill cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200]
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
# disagg server cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
# gpt_oss_20b_fp4
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]
# 7: H20, H100, H200, B200, B300
- condition:
ranges:
system_gpu_count:
gte: 8
compute_capability:
gte: 9.0
lt: 12.0
tests:
# chunked attention case
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]
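
For context on the `condition:` block in the hunk above: in these test-db YAML lists it gates a group of tests on machine properties such as GPU count and compute capability. The snippet below is not the actual harness logic, only a minimal illustrative sketch, assuming `gte` is an inclusive lower bound, `lt` an exclusive upper bound, and that the runner supplies `system_gpu_count` and `compute_capability`; the helper names are made up for illustration.

# Illustrative sketch only: evaluate a test-db style `condition.ranges`
# block against a machine description. Field semantics and helper names
# are assumptions, not the real test-harness API.
from typing import Any, Dict, List

def ranges_match(ranges: Dict[str, Dict[str, float]], machine: Dict[str, float]) -> bool:
    """Return True if every listed property satisfies its gte/lt bounds."""
    for prop, bounds in ranges.items():
        value = machine[prop]
        if "gte" in bounds and not value >= bounds["gte"]:
            return False
        if "lt" in bounds and not value < bounds["lt"]:
            return False
    return True

def select_tests(entries: List[Dict[str, Any]], machine: Dict[str, float]) -> List[str]:
    """Collect tests whose (optional) condition matches the machine."""
    selected: List[str] = []
    for entry in entries:
        cond = entry.get("condition", {})
        if ranges_match(cond.get("ranges", {}), machine):
            selected.extend(entry.get("tests", []))
    return selected

# Example: an 8-GPU box with compute capability in [9.0, 12.0), matching the
# condition shown in the hunk above.
machine = {"system_gpu_count": 8, "compute_capability": 9.0}
entries = [{
    "condition": {"ranges": {"system_gpu_count": {"gte": 8},
                             "compute_capability": {"gte": 9.0, "lt": 12.0}}},
    "tests": ["perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]"],
}]
print(select_tests(entries, machine))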

View File

@ -82,6 +82,7 @@ l0_b200:
- unittest/_torch/modeling -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_gpt_oss"
- unittest/_torch/modeling/test_modeling_exaone_moe.py
- unittest/tools/test_layer_wise_benchmarks.py::test_deepseek_r1_ctx_dep[1]
- unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1]
- unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8

View File

@ -43,6 +43,7 @@ l0_dgx_h100:
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-False]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
- unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
- disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
# llmapi

View File

@ -42,5 +42,4 @@ l0_perf:
stage: pre_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-_autodeploy-float16-input_output_len:128,128-reqs:8192]
- perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-_autodeploy-float16-input_output_len:1024,1024-reqs:512]

View File

@ -1,6 +1,5 @@
examples/test_openai.py::test_llm_openai_triton_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654)
full:GH200/examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (arm is not supported)
full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported)
full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (arm is not supported)
full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported)
@ -13,15 +12,10 @@ full:GH200/examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int
perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449)
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2 SKIP (not supported yet)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514)
examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/4781396)
perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization)
perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization)
cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input)
full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] SKIP (skip on L40S commit f9a0fcb0)
full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5014327)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (https://nvbugs/5000026)
@ -31,7 +25,6 @@ examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-full_prec] SKIP
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8] SKIP (https://nvbugs/5000026)
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec] SKIP (https://nvbugs/5000026)
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq] SKIP (https://nvbugs/5000026)
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5000026)
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
@ -47,42 +40,15 @@ full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blac
full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:sm100/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (Disable for Blackwell OOM)
full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5064768)
test_e2e.py::test_openai_consistent_chat SKIP (https://nvbugs/5112075)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (https://nvbugs/5114678)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] SKIP (https://nvbugs/5135328)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5141288)
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5176867)
full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 SKIP (https://nvbugs/5176867)
full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights SKIP (https://nvbugs/5176867)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
@ -106,10 +72,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-rec
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1] SKIP (https://nvbugs/5219535)
examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2] SKIP (https://nvbugs/5219535)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
@ -132,15 +94,10 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[deepseek
full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugspro.nvidia.com/bug/5150255)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405)
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] SKIP (https://nvbugs/5273695)
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
@ -186,33 +143,19 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714)
test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075)
examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488)
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043)
full:L40S/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
full:L20/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] SKIP (https://nvbugs/5380570)
test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-8] SKIP (https://nvbugs/5380570)
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288)
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067)
examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068)
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5421989)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5421989)
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132)
accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache SKIP (https://nvbugs/5433541)
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugs/5433541)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627)
@ -226,14 +169,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-4-mini-instruct] SKIP (https://nvbugs/5465143)
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362)
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392)
@ -241,17 +176,6 @@ triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5
triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378)
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5477421)
test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5448462)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448462)
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448479)
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] SKIP (https://nvbugs/5465143)
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5465143, 5481206 WNF)
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5465143, 5481206 WNF)
accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075)
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5738168)
test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523)
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319)
@ -260,10 +184,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507)
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507)
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507)
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507)
examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507)
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507)
examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507)
cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689)
cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689)
full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
@ -278,26 +198,9 @@ full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_f
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553)
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553)
full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052)
full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052)
full:GB200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052)
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052)
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (https://nvbugs/5451207)
accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 SKIP (https://nvbugs/5511944)
triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP (https://nvbugs/5470830)
full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search SKIP (https://nvbugs/5481075)
full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052)
full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337)
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143)
unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781)
triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
@ -317,14 +220,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832)
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5655832)
test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825)
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904)
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904)
@ -335,9 +230,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804)
examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826)
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216)
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype SKIP (https://nvbugs/5588376)
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
@ -361,9 +254,6 @@ accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/570
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194)
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2 SKIP (https://nvbugs/5705195)
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] SKIP (https://nvbugs/5666826)
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype SKIP (https://nvbugs/5707087)
disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
@ -387,35 +277,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_tr
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304)
unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392)
unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476)
examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293)
examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293)
examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377)
test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432)
test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920)
test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938)
triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981)
unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[False] SKIP (https://nvbugs/5771838)
unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[True] SKIP (https://nvbugs/5771838)
accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi2::test_tp2 SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5744293)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979)
examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5754976)
examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (https://nvbugs/5754976)
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5568052)
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376)
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698)
triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737)
@ -462,6 +335,7 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
@ -475,7 +349,6 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)
unittest/_torch/attention/test_flashinfer_star_attn.py::TestStarAttention::test_flashinfer_star_attention[num_layers:2-num_heads:32-num_kv_heads:8-head_dim:64-anchor_size:64-block_size:64-dtype:torch.float16] SKIP (https://nvbugs/5781389)
unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:16-hidden:128] SKIP (https://nvbugs/5781383)
cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
@ -492,11 +365,9 @@ unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_in
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206)
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5785465)
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5785485)
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5787855)
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5787855)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/5787836)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836)
@ -515,3 +386,11 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (http
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5800591)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5800646)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] SKIP (https://nvbugs/5800679)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5741304)
accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/5800725)
examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248)
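
The waive entries above all follow the same plain-text shape: an optional `full:<PLATFORM>/` prefix, a pytest node id, the keyword SKIP, and usually a parenthesised reason (typically an nvbugs link). The sketch below is a small, purely illustrative parser for that shape; the regex and field names are assumptions drawn from the entries above, not the format's official grammar.

# Illustrative sketch: split a waive line into its visible parts.
# The pattern is inferred from the entries above, not from the harness.
import re
from typing import NamedTuple, Optional

class Waive(NamedTuple):
    platform: Optional[str]   # e.g. "B200", or None when there is no full:<...>/ prefix
    test_id: str              # pytest node id (may include -m/-k filters)
    reason: Optional[str]     # text inside the trailing parentheses, if any

_WAIVE_RE = re.compile(
    r"^(?:full:(?P<platform>[^/]+)/)?"     # optional full:<PLATFORM>/ prefix
    r"(?P<test_id>.+?)\s+SKIP"             # node id up to the SKIP keyword
    r"(?:\s*#?\s*\((?P<reason>[^)]*)\))?"  # optional (reason), sometimes after a '#'
)

def parse_waive(line: str) -> Optional[Waive]:
    m = _WAIVE_RE.match(line.strip())
    if not m:
        return None
    return Waive(m.group("platform"), m.group("test_id"), m.group("reason"))

line = ("full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary"
        "[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa]"
        " SKIP (https://nvbugs/5568052)")
print(parse_waive(line))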

View File

@ -1,6 +1,6 @@
server_configs:
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -31,7 +31,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -62,7 +62,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -97,7 +97,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -128,7 +128,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -159,7 +159,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -194,7 +194,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -225,7 +225,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -256,7 +256,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 4
match_mode: scenario
cuda_graph_config:
@ -291,7 +291,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -322,7 +322,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -353,7 +353,7 @@ server_configs:
backend: openai
streaming: true
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
model_name: deepseek_r1_0528_fp4_v2
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -388,7 +388,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -419,7 +419,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -450,7 +450,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -481,7 +481,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -512,7 +512,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -543,7 +543,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -578,7 +578,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -613,7 +613,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -648,7 +648,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -683,7 +683,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -718,7 +718,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -753,7 +753,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -788,7 +788,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -823,7 +823,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -858,7 +858,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -893,7 +893,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -928,7 +928,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -963,7 +963,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -998,7 +998,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1033,7 +1033,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1068,7 +1068,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1103,7 +1103,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1138,7 +1138,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1173,7 +1173,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1208,7 +1208,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1243,7 +1243,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1313,7 +1313,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1348,7 +1348,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1383,7 +1383,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1418,7 +1418,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1453,7 +1453,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1488,7 +1488,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1523,7 +1523,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1558,7 +1558,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1593,7 +1593,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1628,7 +1628,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1663,7 +1663,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1698,7 +1698,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1733,7 +1733,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1768,7 +1768,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1803,7 +1803,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:

View File

@ -1,6 +1,6 @@
server_configs:
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -31,7 +31,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -62,7 +62,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -93,7 +93,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -124,7 +124,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -155,7 +155,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -190,7 +190,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -224,7 +224,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -258,7 +258,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -292,7 +292,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -326,7 +326,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -360,7 +360,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -394,7 +394,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -428,7 +428,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -462,7 +462,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -496,7 +496,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -530,7 +530,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -564,7 +564,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -598,7 +598,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -632,7 +632,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -666,7 +666,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -700,7 +700,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -734,7 +734,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -768,7 +768,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -802,7 +802,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -836,7 +836,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -870,7 +870,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -904,7 +904,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -938,7 +938,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -972,7 +972,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1006,7 +1006,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1040,7 +1040,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1074,7 +1074,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1108,7 +1108,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1142,7 +1142,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1176,7 +1176,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1210,7 +1210,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1244,7 +1244,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1312,7 +1312,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1346,7 +1346,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1380,7 +1380,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:

View File

@ -131,7 +131,7 @@ server_configs:
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -161,7 +161,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -191,7 +191,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.2
backend: "openai"
# 1k8k configs

View File

@ -1,11 +1,18 @@
import subprocess
import time
import requests
def wait_for_endpoint_ready(url: str, timeout: int = 300):
def wait_for_endpoint_ready(url: str, timeout: int = 300, server_proc: subprocess.Popen | None = None):
start = time.monotonic()
while time.monotonic() - start < timeout:
if server_proc is not None:
exit_code = server_proc.poll()
if exit_code is not None:
raise RuntimeError(
f"Server process exited with code {exit_code} before becoming ready."
)
try:
time.sleep(1)
if requests.get(url, timeout=5).status_code == 200:
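
The updated helper now fails fast when the launched server dies during startup instead of silently polling until the timeout. A minimal, self-contained usage sketch follows; the tail of the polling loop, the TimeoutError, and the launch command are illustrative assumptions, not code from this change:

import subprocess
import time

import requests


def wait_for_endpoint_ready(url, timeout=300, server_proc=None):
    """Poll `url` until it returns HTTP 200, aborting early if the server process dies."""
    start = time.monotonic()
    while time.monotonic() - start < timeout:
        # Fail fast: surface a crashed server instead of waiting out the full timeout.
        if server_proc is not None:
            exit_code = server_proc.poll()
            if exit_code is not None:
                raise RuntimeError(
                    f"Server process exited with code {exit_code} before becoming ready.")
        try:
            time.sleep(1)
            if requests.get(url, timeout=5).status_code == 200:
                return
        except requests.RequestException:
            continue  # Endpoint not reachable yet; keep polling.
    raise TimeoutError(f"Endpoint {url} was not ready after {timeout} seconds.")


if __name__ == "__main__":
    # Placeholder launch command for illustration; substitute the real server entry point.
    proc = subprocess.Popen(["python", "-m", "http.server", "8000"])
    try:
        wait_for_endpoint_ready("http://localhost:8000/", timeout=60, server_proc=proc)
        print("endpoint ready")
    finally:
        proc.terminate()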

View File

@ -0,0 +1,406 @@
import unittest
from copy import deepcopy
from dataclasses import dataclass
import torch
from _torch.helpers import create_mock_cuda_graph_runner
from parameterized import parameterized
import tensorrt_llm
from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
from tensorrt_llm._torch.metadata import KVCacheParams
from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.models.modeling_exaone_moe import ExaoneMoeForCausalLM
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm.bindings.executor import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from utils.util import getSMVersion # isort: skip
# fmt: off
# TODO: Remove this once we have a proper transformers package
from tensorrt_llm._torch.models.modeling_exaone_moe import ExaoneMoEConfig # isort: skip
SKIP_EXAONE_MOE_HF_ACCURACY_TEST = False
try:
from transformers.models.exaone_moe.modeling_exaone_moe import (
ExaoneMoEForCausalLM as HFExaoneMoEForCausalLM,
)
except ImportError:
# TODO: Remove this once we have a proper config for EXAONE-MoE
SKIP_EXAONE_MOE_HF_ACCURACY_TEST = True
# fmt: on
WINDOW_SIZE = 4
NUM_HIDDEN_LAYERS = 4
EXAONE_MOE_CONFIG = {
"architectures": ["ExaoneMoEForCausalLM"],
"attention_dropout": 0.0,
"bos_token_id": 1,
"dtype": "bfloat16",
"eos_token_id": 53,
"first_last_k_dense_replace": 1,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 6144,
"initializer_range": 0.02,
"intermediate_size": 18432,
"is_moe_layer": [False] + [True] * (NUM_HIDDEN_LAYERS - 1),
"layer_types": [
"sliding_attention",
"sliding_attention",
"sliding_attention",
"full_attention",
],
"max_position_embeddings": 262144,
"model_type": "exaone_moe",
"moe_intermediate_size": 2048,
"n_group": 1,
"norm_topk_prob": True,
"num_attention_heads": 64,
"num_experts": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": NUM_HIDDEN_LAYERS,
"num_key_value_heads": 8,
"num_shared_experts": 1,
"pad_token_id": 0,
"rms_norm_eps": 1e-05,
"rope_scaling": None,
"rope_theta": 1000000,
"routed_scaling_factor": 2.5,
"scoring_func": "sigmoid",
"sliding_window": WINDOW_SIZE,
"sliding_window_pattern": "LLLG",
"tie_word_embeddings": False,
"tokenizer_class": "GPT2Tokenizer",
"topk_group": 1,
"topk_method": "noaux_tc",
"transformers_version": "5.0.0.dev0",
"use_cache": True,
"vocab_size": 153600,
}
@dataclass(repr=False)
class Scenario:
attention_backend: str
input_len: int = WINDOW_SIZE - 1
use_cuda_graph: bool = False
def __repr__(self) -> str:
return (
f"attention_backend:{self.attention_backend.lower()}-"
f"input_len:{self.input_len}-"
f"use_cuda_graph:{self.use_cuda_graph}"
)
class TestExaoneMoe(unittest.TestCase):
@parameterized.expand([None, "FP8"])
def test_exaone_moe_sanity(self, quant_algo):
"""Test basic EXAONE-MoE model forward pass with optional quantization."""
config_dict = deepcopy(EXAONE_MOE_CONFIG)
exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)
if quant_algo:
quant_config = QuantConfig(quant_algo=quant_algo)
else:
quant_config = QuantConfig()
if quant_algo == "FP8" and getSMVersion() < 89:
self.skipTest("This test is not supported in pre-Ada architecture")
dtype = exaone_moe_config.torch_dtype
device = torch.device("cuda")
model_config = ModelConfig(pretrained_config=exaone_moe_config, quant_config=quant_config)
exaone_moe = ExaoneMoeForCausalLM(model_config).to(device)
input_ids = torch.tensor(
[100, 200, 300, 100, 200, 100, 400, 500], dtype=torch.int, device=device
)
context_sequence_lengths = [3, 2, 1]
sequence_lengths = context_sequence_lengths + [1, 1]
past_seen_tokens = [0, 0, 0, 62, 75]
request_ids = list(range(len(sequence_lengths)))
token_nums = (torch.tensor(past_seen_tokens) + torch.tensor(sequence_lengths)).tolist()
prompt_lens = token_nums[:3] + past_seen_tokens[3:]
num_blocks = 100
tokens_per_block = 128
head_dim = exaone_moe.config.hidden_size // exaone_moe.config.num_attention_heads
num_layers = exaone_moe.config.num_hidden_layers
num_kv_heads = exaone_moe.config.num_key_value_heads
max_seq_len = num_blocks * tokens_per_block
batch_size = len(context_sequence_lengths) + 2
if dtype == torch.half:
kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
elif dtype == torch.bfloat16:
kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
else:
raise ValueError("Invalid dtype")
mapping = Mapping(world_size=1, tp_size=1, rank=0)
kv_cache_config = KvCacheConfig(max_tokens=num_blocks * tokens_per_block)
kv_cache_manager = KVCacheManager(
kv_cache_config,
tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
num_layers=num_layers,
num_kv_heads=num_kv_heads,
head_dim=head_dim,
tokens_per_block=tokens_per_block,
max_seq_len=max_seq_len,
max_batch_size=batch_size,
mapping=mapping,
dtype=kv_cache_dtype,
)
kv_cache_manager.add_dummy_requests(request_ids, token_nums)
metadata_cls = get_attention_backend(model_config.attn_backend).Metadata
attn_metadata = metadata_cls(
seq_lens=torch.tensor(sequence_lengths, dtype=torch.int),
num_contexts=len(context_sequence_lengths),
kv_cache_params=KVCacheParams(
use_cache=True,
num_cached_tokens_per_seq=past_seen_tokens,
),
kv_cache_manager=kv_cache_manager,
request_ids=request_ids,
prompt_lens=prompt_lens,
max_num_requests=len(context_sequence_lengths) + 2,
max_num_tokens=8192,
)
position_ids = []
for i, tokens in enumerate(past_seen_tokens):
seq_len = context_sequence_lengths[i] if i < len(context_sequence_lengths) else 1
position_id = torch.arange(tokens, tokens + seq_len, device=input_ids.device)
position_ids.append(position_id)
position_ids = torch.cat(position_ids).unsqueeze(0)
with torch.inference_mode():
attn_metadata.prepare()
logits = exaone_moe.forward(
input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
)
self.assertEqual(len(past_seen_tokens), logits.shape[0])
with torch.inference_mode():
attn_metadata.prepare()
logits = exaone_moe.forward(
input_ids=input_ids,
position_ids=position_ids,
attn_metadata=attn_metadata,
return_context_logits=True,
)
self.assertEqual(input_ids.shape, logits.shape[:-1])
kv_cache_manager.shutdown()
def test_exaone_moe_moe_layer_config(self):
"""Test that MoE layers are correctly configured."""
config_dict = deepcopy(EXAONE_MOE_CONFIG)
exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)
device = torch.device("cuda")
model_config = ModelConfig(pretrained_config=exaone_moe_config)
exaone_moe = ExaoneMoeForCausalLM(model_config).to(device)
# Verify MoE layer configuration
is_moe_layer = config_dict["is_moe_layer"]
self.assertEqual(len(is_moe_layer), NUM_HIDDEN_LAYERS)
self.assertFalse(is_moe_layer[0]) # First layer should be dense
for i in range(1, NUM_HIDDEN_LAYERS):
self.assertTrue(is_moe_layer[i]) # Rest should be MoE
# Verify model has correct number of layers
self.assertEqual(len(exaone_moe.model.layers), NUM_HIDDEN_LAYERS)
@parameterized.expand(
[
Scenario(attention_backend="TRTLLM", input_len=WINDOW_SIZE - 2),
Scenario(attention_backend="TRTLLM", input_len=WINDOW_SIZE - 2, use_cuda_graph=True),
],
lambda testcase_func, param_num, param: f"{testcase_func.__name__}[{param.args[0]}]",
)
@torch.no_grad()
def test_exaone_moe_allclose_to_hf(self, scenario: Scenario) -> None:
"""Compare output to HuggingFace implementation."""
if SKIP_EXAONE_MOE_HF_ACCURACY_TEST:
self.skipTest("EXAONE-MoE HF model is not available in this environment")
attention_backend = scenario.attention_backend
metadata_cls = get_attention_backend(attention_backend).Metadata
torch.random.manual_seed(0)
config_dict = deepcopy(EXAONE_MOE_CONFIG)
exaone_moe_config = ExaoneMoEConfig.from_dict(config_dict)
dtype = exaone_moe_config.torch_dtype
device = torch.device("cuda")
hf_exaone_moe = HFExaoneMoEForCausalLM(exaone_moe_config).to(dtype).to(device).eval()
model_config = ModelConfig(
pretrained_config=exaone_moe_config, attn_backend=attention_backend
)
exaone_moe = ExaoneMoeForCausalLM(model_config).to(dtype).to(device)
exaone_moe.load_weights(hf_exaone_moe.state_dict())
exaone_moe.post_load_weights()
num_blocks = 1
tokens_per_block = 128
head_dim = getattr(
exaone_moe.config,
"head_dim",
exaone_moe.config.hidden_size // exaone_moe.config.num_attention_heads,
)
num_layers = exaone_moe.config.num_hidden_layers
num_kv_heads = exaone_moe.config.num_key_value_heads
max_seq_len = num_blocks * tokens_per_block
batch_size = 1
if dtype == torch.half:
kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
elif dtype == torch.bfloat16:
kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
else:
raise ValueError("Invalid dtype")
mapping = Mapping(world_size=1, tp_size=1, rank=0)
kv_cache_config = KvCacheConfig(
enable_block_reuse=False,
enable_partial_reuse=False,
copy_on_partial_reuse=False,
max_attention_window=[int(exaone_moe_config.sliding_window)],
max_tokens=num_blocks * tokens_per_block,
)
kv_cache_manager = KVCacheManager(
kv_cache_config,
tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
num_layers=num_layers,
num_kv_heads=num_kv_heads,
head_dim=head_dim,
tokens_per_block=tokens_per_block,
max_seq_len=max_seq_len,
max_batch_size=batch_size,
mapping=mapping,
dtype=kv_cache_dtype,
)
# Context phase
input_ids = torch.tensor(
[i * 100 for i in range(1, scenario.input_len + 1)], dtype=torch.int32, device=device
)
num_cached_tokens_per_seq = [0]
request_ids = [1]
token_nums = [input_ids.size(-1)]
prompt_lens = [input_ids.size(-1)]
kv_cache_manager.add_dummy_requests(request_ids, token_nums)
attn_metadata = metadata_cls(
seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
num_contexts=1,
kv_cache_params=KVCacheParams(
use_cache=True,
num_cached_tokens_per_seq=num_cached_tokens_per_seq,
),
max_num_requests=1,
max_num_tokens=8192,
kv_cache_manager=kv_cache_manager,
request_ids=request_ids,
prompt_lens=prompt_lens,
)
position_ids = [torch.arange(0, input_ids.size(-1), dtype=torch.int32)]
position_ids = torch.cat(position_ids).unsqueeze(0).cuda()
with torch.inference_mode():
attn_metadata.prepare()
logits = exaone_moe.forward(
input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
)
ref = hf_exaone_moe.forward(
input_ids=input_ids.unsqueeze(0), position_ids=position_ids, use_cache=True
)
# MoE models may need a slightly higher tolerance due to expert routing
torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.5, rtol=0.5)
# Generation phase
gen_input_ids = torch.tensor([600], dtype=torch.int32, device=device)
num_cached_tokens_per_seq = [input_ids.size(-1)]
attn_metadata = metadata_cls(
seq_lens=torch.tensor([gen_input_ids.size(-1)], dtype=torch.int),
num_contexts=0,
kv_cache_params=KVCacheParams(
use_cache=True,
num_cached_tokens_per_seq=num_cached_tokens_per_seq,
),
max_num_requests=1,
max_num_tokens=8192,
kv_cache_manager=kv_cache_manager,
request_ids=request_ids,
prompt_lens=prompt_lens,
)
gen_position_ids = [
torch.arange(
input_ids.size(-1), input_ids.size(-1) + gen_input_ids.size(-1), dtype=torch.int32
)
]
gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
graph_runner = create_mock_cuda_graph_runner(1) if scenario.use_cuda_graph else None
def run_forward(input_ids, position_ids, attn_metadata):
attn_metadata.prepare()
if not scenario.use_cuda_graph:
return exaone_moe.forward(
input_ids=input_ids, position_ids=position_ids, attn_metadata=attn_metadata
)
else:
inputs = {
"input_ids": input_ids,
"position_ids": position_ids,
"attn_metadata": attn_metadata,
}
key = (1, 0, False)
graph_runner.capture(key, lambda inputs: exaone_moe.forward(**inputs), inputs)
for _ in range(2):
attn_metadata.prepare()
logits = graph_runner.replay(key, inputs)
return logits
if scenario.use_cuda_graph:
attn_metadata = attn_metadata.create_cuda_graph_metadata(1)
with torch.inference_mode():
logits = run_forward(
input_ids=gen_input_ids, position_ids=gen_position_ids, attn_metadata=attn_metadata
)
ref = hf_exaone_moe.forward(
input_ids=gen_input_ids.unsqueeze(0),
position_ids=gen_position_ids,
past_key_values=ref.past_key_values,
use_cache=True,
)
torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.5, rtol=0.5)
if graph_runner is not None:
graph_runner.clear()
kv_cache_manager.shutdown()
if __name__ == "__main__":
unittest.main()
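
The EXAONE_MOE_CONFIG used above interleaves sliding-window and full attention and makes every layer after the first a MoE layer. A short sketch, using only values copied from that config, that prints the per-layer layout asserted by test_exaone_moe_moe_layer_config:

# Sketch: derive the per-layer layout from the test config above.
# Uses only keys present in EXAONE_MOE_CONFIG; this is not a TensorRT-LLM API.
NUM_HIDDEN_LAYERS = 4
config = {
    "is_moe_layer": [False] + [True] * (NUM_HIDDEN_LAYERS - 1),
    "layer_types": [
        "sliding_attention",
        "sliding_attention",
        "sliding_attention",
        "full_attention",
    ],
    "sliding_window": 4,
}

for idx, (attn, is_moe) in enumerate(
        zip(config["layer_types"], config["is_moe_layer"])):
    ffn = "MoE" if is_moe else "dense"
    window = config["sliding_window"] if attn == "sliding_attention" else None
    print(f"layer {idx}: attention={attn}, ffn={ffn}, window={window}")
# layer 0: attention=sliding_attention, ffn=dense, window=4
# layer 1: attention=sliding_attention, ffn=MoE, window=4
# layer 2: attention=sliding_attention, ffn=MoE, window=4
# layer 3: attention=full_attention, ffn=MoE, window=None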

View File

@ -1,12 +1,16 @@
import json
import os
import time
from itertools import product
from pathlib import Path
from typing import Generator
import pytest
import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import MultimodalEncoder
from tensorrt_llm._torch.shared_tensor import SharedTensorContainer
from tensorrt_llm.inputs import default_multimodal_input_loader
from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import LLM, SamplingParams
@ -24,56 +28,127 @@ _QWEN_2_5_VL_DIR = llm_models_root() / "Qwen2.5-VL-3B-Instruct"
_QWEN_3_VL_DIR = llm_models_root() / "Qwen3" / "Qwen3-VL-2B-Instruct"
# TODO: Add multi-image in single chat test
@pytest.mark.parametrize("model_dir",
[_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR])
@pytest.mark.parametrize("pd_disagg", [False, True])
def test_single_image_chat(model_dir, pd_disagg):
"""Test processing single image using encoder (pass mm_embeddings) + LLM API.
@pytest.mark.parametrize(
"prompts,expected_num_duplicates",
[
# Full reuse: same media + same prompts
# All blocks are reused, thus no duplicates
(["Describe the natural environment in the image."] * 2, 0),
# Partial reuse: same media + different prompts
# Prefix blocks are reused, thus 2 duplicates
([
"Describe the natural environment in the image.",
"What objects can you see in the image?",
"Describe the weather in the image.",
], 2),
])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
"""Test mm_keys in KV cache events with cache reuse scenarios.
This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
results to standard llm generation (pass raw image) by comparing outputs.
This test verifies:
1. KV cache events contain mm_keys for multimodal blocks
2. mm_keys have the expected structure (hash + start_offset)
3. Cache reuse behavior based on media and prompts:
- Same media + same prompts: full reuse (0 duplicate offsets)
- Same media + different prompts: partial reuse (prefix blocks reused)
"""
encoder_model_dir = _LLAVA_DIR
# Test configuration
max_tokens = 64
max_tokens = 16
free_gpu_memory_fraction = 0.2
max_batch_size = 1
# Test data - OpenAI chat completion format
prompts = ["Describe the natural environment in the image."]
media = [example_images[0]]
# Use same image for all prompts
media = [example_images[0]] * len(prompts)
# Sampling configuration
sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=False,
enable_block_reuse=True,
free_gpu_memory_fraction=free_gpu_memory_fraction,
event_buffer_max_size=1024, # Enable KV cache events
)
llm = LLM(model=encoder_model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1)
inputs = _load_inputs(llm, prompts, media)
with llm:
# Generate for each input separately to test KV cache reuse
for inp in inputs:
_ = llm.generate([inp], sampling_params=sampling_params)
time.sleep(0.5) # Wait for events to be dispatched
events = llm.get_kv_cache_events(10)
# Extract mm_keys offsets from stored events
mm_keys_offsets = []
for event in events:
if event and event.get("data", {}).get("type") == "stored":
for block in event["data"].get("blocks", []):
if block.get("mm_keys"):
for mm_key in block["mm_keys"]:
assert "hash" in mm_key, "mm_key should have 'hash' field"
assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
mm_keys_offsets.append(mm_key["start_offset"])
num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
assert num_duplicates == expected_num_duplicates, (
f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
f"got {num_duplicates}. Offsets: {mm_keys_offsets}")
@pytest.fixture(scope="module",
params=[_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR],
ids=["llava_7b", "qwen2.5_3b", "qwen3_2b"])
def model_dir(request) -> Path:
return request.param
@pytest.fixture(scope="module", params=[False, True])
def pd_disagg(request) -> bool:
return request.param
@pytest.fixture(scope="module")
def llms(model_dir: Path,
pd_disagg: bool) -> Generator[tuple[LLM, LLM | None], None, None]:
"""Get LLM for prefill and, if disagg, separate LLM for decode."""
free_gpu_memory_fraction = 0.2
disable_overlap_scheduler = pd_disagg
cache_transceiver_cfg = CacheTransceiverConfig(
backend="DEFAULT") if pd_disagg else None
kv_cache_config = KvCacheConfig(
enable_block_reuse=False, # Disable for output 1:1 matching check
free_gpu_memory_fraction=free_gpu_memory_fraction,
)
# Process multimodal data using encoder (pass mm_embeddings)
encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
llm = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
disable_overlap_scheduler=disable_overlap_scheduler,
max_batch_size=1, # fix batch size to reduce non-determinism in tests
)
with llm:
if pd_disagg:
llm_decode = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
)
with llm_decode:
yield (llm, llm_decode)
else:
yield (llm, None)
cache_transceiver_cfg = CacheTransceiverConfig(
backend="DEFAULT") if pd_disagg else None
disable_overlap_scheduler = pd_disagg
llm = LLM(model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
disable_overlap_scheduler=disable_overlap_scheduler)
llm_decode = None
if pd_disagg:
llm_decode = LLM(model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg)
def _load_inputs(llm: LLM, prompts, media, mm_embeddings=None):
# Load model configuration
config_path = os.path.join(llm._hf_model_dir, 'config.json')
assert os.path.exists(
@ -90,11 +165,42 @@ def test_single_image_chat(model_dir, pd_disagg):
modality="image",
prompts=prompts,
media=media,
mm_embeddings=mm_embeddings,
image_data_format="pt")
# Validate inputs structure
assert len(inputs) == len(
prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
return inputs
# TODO: Add multi-image in single chat test
@pytest.mark.threadleak(enabled=False)
def test_single_image_chat(
pd_disagg: bool,
model_dir: Path,
llms: tuple[LLM, LLM | None],
):
"""Test processing single image using encoder (pass mm_embeddings) + LLM API.
This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
results to standard llm generation (pass raw image) by comparing outputs.
"""
llm, llm_decode = llms
# Test configuration
max_tokens = 64
max_batch_size = 1
# Test data - OpenAI chat completion format
prompts = ["Describe the natural environment in the image."]
media = [example_images[0]]
# Sampling configuration
sampling_params = SamplingParams(max_tokens=max_tokens)
# Prepare multimodal inputs
inputs = _load_inputs(llm, prompts, media)
# Generate reference output with raw multimodal inputs
outputs_ref = llm.generate(inputs, sampling_params=sampling_params)
@ -109,33 +215,35 @@ def test_single_image_chat(model_dir, pd_disagg):
) > 0, f"Reference generation has no output text for input {i}"
# Prepare inputs for llm (pass mm_embeddings)
encoder_outputs = encoder.generate(inputs)
# Process multimodal data using encoder (pass mm_embeddings)
encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
with encoder:
encoder_outputs = encoder.generate(inputs)
# Generate output using llm (pass mm_embeddings)
ep_disaggregated_params = encoder_outputs[0].disaggregated_params
# Generate output using llm (pass mm_embeddings)
ep_disaggregated_params = encoder_outputs[0].disaggregated_params
assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=ep_disaggregated_params)
outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=ep_disaggregated_params)
if pd_disagg:
# Generation using llm_decode
assert len(outputs) == 1
pd_disaggregated_params = outputs[0].disaggregated_params
pd_disaggregated_params.request_type = "generation_only"
sampling_params = SamplingParams(max_tokens=max_tokens)
# remove multimodal data from input as decoder worker doesn't need it
inputs[0]['multi_modal_data'] = None
# use prompt token ids from encoder output
inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids
if pd_disagg:
# Generation using llm_decode
assert len(outputs) == 1
pd_disaggregated_params = outputs[0].disaggregated_params
pd_disaggregated_params.request_type = "generation_only"
sampling_params = SamplingParams(max_tokens=max_tokens)
# remove multimodal data from input as decoder worker doesn't need it
inputs[0]['multi_modal_data'] = None
# use prompt token ids from encoder output
inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids
outputs = llm_decode.generate(
inputs,
sampling_params=sampling_params,
disaggregated_params=pd_disaggregated_params)
outputs = llm_decode.generate(
inputs,
sampling_params=sampling_params,
disaggregated_params=pd_disaggregated_params)
# Validate outputs
assert len(outputs) == len(
@ -175,24 +283,37 @@ def test_single_image_chat(model_dir, pd_disagg):
f"Log probabilities don't match for output {i}, generation {j}"
@pytest.mark.parametrize(
"model_dir, encoder_max_batch_size",
[
(_LLAVA_DIR, 3),
# Qwen2.5 VL's vision encoder seems to output different embeddings based on this value.
# The test only passes with this set to 1.
(_QWEN_2_5_VL_DIR, 1),
(_QWEN_3_VL_DIR, 3),
],
)
def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
@pytest.mark.parametrize("use_mm_embeddings,pass_embeddings_through_loader",
product([False, True], [False, True]))
@pytest.mark.threadleak(enabled=False)
def test_multi_request_batch_chat(
model_dir: Path,
llms: tuple[LLM, LLM | None],
use_mm_embeddings: bool,
pass_embeddings_through_loader: bool,
):
"""Test batching multiple multimodal requests and verify encoder path matches raw path.
This mirrors test_single_image_chat but with a batch of size 3.
This mirrors test_single_image_chat but with a batch of size 3. It also tests passing
embeddings alongside the prompt ("multi_modal_embeddings"), as well as the embedding
handling within default_multimodal_input_loader.
"""
if use_mm_embeddings and model_dir in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR]:
pytest.skip("Qwen does not implement attach_multimodal_embeddings")
# Qwen2.5/3 VL's vision encoder seems to output different embeddings based on this value.
# The test only passes with this set to 1.
encoder_max_batch_size = (1 if model_dir
in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR] else 3)
llm, llm_decode = llms
if llm_decode is not None:
pytest.skip("Disagg support not implemented in test case")
if pass_embeddings_through_loader and not use_mm_embeddings:
pytest.skip("Redundant test configuration")
max_tokens = 64
free_gpu_memory_fraction = 0.6
prompts = [
"Describe the natural environment in the image.",
@ -202,37 +323,8 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
media = [example_images[0], example_images[1], example_images[2]]
sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=
False, # Disable block reuse for output 1-1 matching check
free_gpu_memory_fraction=free_gpu_memory_fraction,
)
encoder = MultimodalEncoder(model=model_dir,
max_batch_size=encoder_max_batch_size)
llm = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1, # fix batch size to reduce non-determinism in tests
trust_remote_code=True)
config_path = os.path.join(llm._hf_model_dir, 'config.json')
assert os.path.exists(
config_path), f"Model config not found at {config_path}"
with open(config_path, 'r') as f:
model_config = json.load(f)
model_type = model_config['model_type']
inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
model_dir=llm._hf_model_dir,
model_type=model_type,
modality="image",
prompts=prompts,
media=media,
image_data_format="pt")
assert len(inputs) == len(
prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
inputs = _load_inputs(llm, prompts, media)
# Reference with raw inputs
outputs_ref = llm.generate(inputs, sampling_params=sampling_params)
@ -242,107 +334,74 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
output.outputs
) > 0, f"Reference generation has no output text for input {i}"
# Encoder path
encoder_outputs = encoder.generate(inputs)
for eo in encoder_outputs:
eo.disaggregated_params.request_type = "context_and_generation"
outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=[
eo.disaggregated_params for eo in encoder_outputs
])
encoder = MultimodalEncoder(model=model_dir,
max_batch_size=encoder_max_batch_size)
with encoder:
# Encoder path
encoder_outputs = encoder.generate(inputs)
if use_mm_embeddings:
for input, encoder_output in zip(inputs, encoder_outputs):
mm_embed_handle = encoder_output.mm_embedding_handle
assert mm_embed_handle is not None
mm_embed = SharedTensorContainer.from_dict(
mm_embed_handle).get_local_view()
input["multi_modal_embeddings"] = {"image": mm_embed}
assert len(outputs) == len(prompts)
for i, output in enumerate(outputs):
assert len(
output.outputs) > 0, f"generation has no output text for input {i}"
if pass_embeddings_through_loader:
# Test embedding support in default_multimodal_input_loader
inputs_with_embeddings = _load_inputs(
llm,
prompts,
media=None,
mm_embeddings=[
input["multi_modal_embeddings"]["image"]
for input in inputs
],
)
for input, input_with_embedding in zip(inputs,
inputs_with_embeddings):
assert isinstance(input, dict)
assert isinstance(input_with_embedding, dict)
assert list(
set(input.keys())
^ set(input_with_embedding.keys())) == [
"multi_modal_data"
]
assert set(input_with_embedding.keys()) == set(
["prompt", "multi_modal_embeddings"])
assert input["prompt"] == input_with_embedding["prompt"]
assert list(
input["multi_modal_embeddings"].keys()) == ["image"]
assert list(input_with_embedding["multi_modal_embeddings"].
keys()) == ["image"]
mm_embed, = input_with_embedding["multi_modal_embeddings"][
"image"]
torch.testing.assert_close(
mm_embed, input["multi_modal_embeddings"]["image"])
inputs = inputs_with_embeddings # perform inference with embeddings returned by input loader
# Compare
for i, (ref_output, test_output) in enumerate(zip(outputs_ref, outputs)):
assert len(ref_output.outputs) == len(test_output.outputs), \
f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
for j, (ref_gen, test_gen) in enumerate(
zip(ref_output.outputs, test_output.outputs)):
assert ref_gen.text == test_gen.text, \
f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"
extra_kwargs = {}
else:
for eo in encoder_outputs:
eo.disaggregated_params.request_type = "context_and_generation"
extra_kwargs = dict(disaggregated_params=[
eo.disaggregated_params for eo in encoder_outputs
])
outputs = llm.generate(inputs,
sampling_params=sampling_params,
**extra_kwargs)
assert len(outputs) == len(prompts)
for i, output in enumerate(outputs):
assert len(output.outputs
) > 0, f"generation has no output text for input {i}"
@pytest.mark.parametrize(
"prompts,expected_num_duplicates",
[
# Full reuse: same media + same prompts
# All blocks are reused, thus no duplicates
(["Describe the natural environment in the image."] * 2, 0),
# Partial reuse: same media + different prompts
# Prefix blocks are reused, thus 2 duplicates
([
"Describe the natural environment in the image.",
"What objects can you see in the image?",
"Describe the weather in the image.",
], 2),
])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
"""Test mm_keys in KV cache events with cache reuse scenarios.
This test verifies:
1. KV cache events contain mm_keys for multimodal blocks
2. mm_keys have the expected structure (hash + start_offset)
3. Cache reuse behavior based on media and prompts:
- Same media + same prompts: full reuse (0 duplicate offsets)
- Same media + different prompts: partial reuse (prefix blocks reused)
"""
encoder_model_dir = _LLAVA_DIR
max_tokens = 16
free_gpu_memory_fraction = 0.6
# Use same image for all prompts
media = [example_images[0]] * len(prompts)
sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
free_gpu_memory_fraction=free_gpu_memory_fraction,
event_buffer_max_size=1024, # Enable KV cache events
)
llm = LLM(model=encoder_model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1)
config_path = os.path.join(llm._hf_model_dir, 'config.json')
with open(config_path, 'r') as f:
model_config = json.load(f)
model_type = model_config['model_type']
inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
model_dir=llm._hf_model_dir,
model_type=model_type,
modality="image",
prompts=prompts,
media=media,
image_data_format="pt")
# Generate for each input separately to test KV cache reuse
for inp in inputs:
_ = llm.generate([inp], sampling_params=sampling_params)
time.sleep(0.5) # Wait for events to be dispatched
events = llm.get_kv_cache_events(10)
# Extract mm_keys offsets from stored events
mm_keys_offsets = []
for event in events:
if event and event.get("data", {}).get("type") == "stored":
for block in event["data"].get("blocks", []):
if block.get("mm_keys"):
for mm_key in block["mm_keys"]:
assert "hash" in mm_key, "mm_key should have 'hash' field"
assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
mm_keys_offsets.append(mm_key["start_offset"])
num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
assert num_duplicates == expected_num_duplicates, (
f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
f"got {num_duplicates}. Offsets: {mm_keys_offsets}")
# Compare
for i, (ref_output, test_output) in enumerate(zip(outputs_ref,
outputs)):
assert len(ref_output.outputs) == len(test_output.outputs), \
f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
for j, (ref_gen, test_gen) in enumerate(
zip(ref_output.outputs, test_output.outputs)):
assert ref_gen.text == test_gen.text, \
f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"