Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 15:55:08 +08:00)
[https://nvbugs/5863392][fix] fix partial reuse disabled for disagg (#11247)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Parent: f9eed3ecc2
Commit: 18e611da77
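The change threads the KV-cache partial-reuse flag through the disaggregated-serving path (CacheState, the cache formatters and transceiver, and the Python executor). As a rough sketch of the knobs involved, using only field names that appear in this diff (the exact server-config layout below is an assumption, not the test harness verbatim):

# Hypothetical KV-cache / server configs for a disaggregated setup; the field
# names mirror those used in the test added by this change.
kv_cache_config = {
    "enable_block_reuse": True,
    "enable_partial_reuse": True,  # the flag whose propagation this change fixes
}
ctx_server_config = {
    "disable_overlap_scheduler": True,
    "cache_transceiver_config": {"backend": "DEFAULT"},
    "kv_cache_config": kv_cache_config,
}
gen_server_config = {
    "disable_overlap_scheduler": False,
    "cache_transceiver_config": {"backend": "DEFAULT"},
    "kv_cache_config": kv_cache_config,
}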
@@ -891,6 +891,11 @@ public:
         return mIsSWA;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const
+    {
+        return mEnablePartialReuse;
+    }
+
     [[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);
 
     //! \brief Unpin blocks by block ids directly
@@ -1078,6 +1083,11 @@ public:
         return mIndexerKCacheIndexHeadDim;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const
+    {
+        return mWindowBlockManagers.begin()->second.isEnablePartialReuse();
+    }
+
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
 
@@ -1565,6 +1575,8 @@ public:
 
     [[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
 
+    [[nodiscard]] virtual bool isEnablePartialReuse() const = 0;
+
     [[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
     [[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
     [[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
@@ -1912,6 +1924,11 @@ public:
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const override
+    {
+        return mBlockManager.isEnablePartialReuse();
+    }
+
     [[nodiscard]] bool isEnableIndexerKCache() const override
     {
         return mBlockManager.isEnableIndexerKCache();
@@ -51,7 +51,8 @@ public:
     CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false,
-        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        bool enablePartialReuse = false, bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0,
+        SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
             worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
@@ -60,6 +61,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -69,8 +71,8 @@ public:
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
-        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool enablePartialReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
@@ -78,6 +80,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -87,8 +90,8 @@ public:
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
-        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool enablePartialReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
@@ -96,6 +99,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -186,6 +190,11 @@ public:
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool getEnablePartialReuse() const
+    {
+        return mEnablePartialReuse;
+    }
+
     [[nodiscard]] bool getHasIndexerKCache() const
     {
         return mHasIndexerKCache;
@@ -221,6 +230,7 @@ public:
         sstring << "dpRank:" << mParallelConfig.mDPrank << "\n";
         sstring << "dpSize:" << mParallelConfig.mDPsize << "\n";
         sstring << "enableBlockReuse:" << mEnableBlockReuse << "\n";
+        sstring << "enablePartialReuse:" << mEnablePartialReuse << "\n";
         sstring << "hasIndexerKCache:" << mHasIndexerKCache << "\n";
         sstring << "indexerDimPerHead:" << mIndexerDimPerHead << "\n";
         sstring << "indexerKCacheQuantBlockSize:" << mIndexerKCacheQuantBlockSize << "\n";
@@ -234,6 +244,7 @@ private:
     nvinfer1::DataType mDataType;
    AttentionConfig mAttentionConfig;
     bool mEnableBlockReuse{false};
+    bool mEnablePartialReuse{false};
     bool mHasIndexerKCache{false};
     SizeType32 mIndexerDimPerHead{0};
     SizeType32 mIndexerKCacheQuantBlockSize{128};
@@ -50,7 +50,8 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
 
     // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is
     // distributed among CP ranks. So, we transfer all blocks from send side.
-    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
+    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || !cacheManager->isEnablePartialReuse()
+        || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
     {
         // disable reuse path, and vwsa don't support reuse.
         bool needSendAllForWindow = common::getEnvKVCacheTransferAllBlocksForWindow();
@@ -87,13 +88,13 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
     return BlockRange::fromReuseTree(*cacheManager, lastBlockKey, indexFromEnd);
 }
 
-BlockRange getBlockRangeForReceiving(
-    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse, bool recvSideHasCP)
+BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
+    bool srcEnableBlockReuse, bool srcEnablePartialReuse, bool recvSideHasCP)
 {
     // Note: When recv side has CP, we request all blocks from send side right now.
     auto poolNum = cacheManager->getBlockManager().getNumPools(
         /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
-    if (poolNum == 1 && srcEnableBlockReuse && !recvSideHasCP)
+    if (poolNum == 1 && srcEnableBlockReuse && srcEnablePartialReuse && !recvSideHasCP)
     {
         // Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.
         auto windowSize = cacheManager->getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -555,7 +556,8 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
     auto const& destConfig = session.getOtherState().getCacheState().value();
     auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
     auto& bufferManager = session.getBufferManager();
-    auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse());
+    auto blockRange = getBlockRangeForReceiving(
+        mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), destConfig.getEnablePartialReuse());
 
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
@@ -53,7 +53,7 @@ using CacheTransBufferManager = kv_cache_manager::CacheTransBufferManager;
 using BlockRange = kv_cache_manager::BlockRange;
 
 BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
-    bool srcEnableBlockReuse, bool recvSideHasCP = false);
+    bool srcEnableBlockReuse, bool srcEnablePartialReuse, bool recvSideHasCP = false);
 
 // Used to support the cache transmission with different layouts and different protocols.
 class BaseCacheFormatter
@@ -143,8 +143,8 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
     }
     mCacheState = std::make_unique<executor::kv_cache::CacheState>(cacheStateModelCfg, worldConfig,
         attentionLayerNumPerPP, dataType, attentionType, kvFactor, cacheManager->isEnableBlockReuse(),
-        cacheManager->isEnableIndexerKCache(), cacheManager->getIndexerKCacheIndexHeadDim(),
-        cacheManager->getIndexerKCacheQuantBlockSize());
+        cacheManager->isEnablePartialReuse(), cacheManager->isEnableIndexerKCache(),
+        cacheManager->getIndexerKCacheIndexHeadDim(), cacheManager->getIndexerKCacheQuantBlockSize());
 
     if (mCacheState->getParallelConfig().mEnableAttentionDP)
     {
@@ -825,8 +825,8 @@ public:
     {
         auto* cacheManager = mFormatter->getCacheManager();
         auto beam = 0;
-        auto requestedBlockRange
-            = getBlockRangeForReceiving(cacheManager, llmRequest, destCacheState.getEnableBlockReuse());
+        auto requestedBlockRange = getBlockRangeForReceiving(
+            cacheManager, llmRequest, destCacheState.getEnableBlockReuse(), destCacheState.getEnablePartialReuse());
 
         auto const& uniqueTokens = llmRequest.getUniqueTokens(beam);
         auto lastBlockKey
@@ -357,8 +357,8 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& s
     auto& bufferManager = session.getBufferManager();
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
     bool const recvSideHasCP = selfConfig.getParallelConfig().mContextParallelism > 1;
-    auto blockRange
-        = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), recvSideHasCP);
+    auto blockRange = getBlockRangeForReceiving(
+        mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), destConfig.getEnablePartialReuse(), recvSideHasCP);
     auto const numPools = mCacheManager->getBlockManager().getNumPools(
         /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     auto const& windowSizes = blockRange.getWindowSizes();
@@ -544,12 +544,13 @@ kv_cache::CacheState Serialization::deserializeCacheState(std::istream& is)
     auto attentionType = su::deserialize<decltype(CacheState::AttentionConfig::mAttentionType)>(is);
     auto kvFactor = su::deserialize<decltype(CacheState::AttentionConfig::mKvFactor)>(is);
     auto enableBlockReuse = su::deserialize<bool>(is);
+    auto enablePartialReuse = su::deserialize<bool>(is);
     auto hasIndexerKCache = su::deserialize<bool>(is);
     auto indexerDimPerHead = su::deserialize<decltype(CacheState::ModelConfig::mSizePerHead)>(is);
     auto indexerKCacheQuantBlockSize = su::deserialize<decltype(CacheState::ModelConfig::mTokensPerBlock)>(is);
     return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism,
         contextParallelism, attentionLayerNumPerPP, dataType, attentionType, kvFactor, enableAttentionDP, DPrank,
-        DPsize, enableBlockReuse, hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
+        DPsize, enableBlockReuse, enablePartialReuse, hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
 }
 
 void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& os)
@@ -568,6 +569,7 @@ void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& o
     su::serialize(state.mAttentionConfig.mAttentionType, os);
     su::serialize(state.mAttentionConfig.mKvFactor, os);
     su::serialize(state.mEnableBlockReuse, os);
+    su::serialize(state.mEnablePartialReuse, os);
     su::serialize(state.getHasIndexerKCache(), os);
     su::serialize(state.getIndexerDimPerHead(), os);
     su::serialize(state.getIndexerKCacheQuantBlockSize(), os);
@@ -590,6 +592,7 @@ size_t Serialization::serializedSize(kv_cache::CacheState const& state)
     totalSize += su::serializedSize(state.mAttentionConfig.mAttentionType);
     totalSize += su::serializedSize(state.mAttentionConfig.mKvFactor);
     totalSize += su::serializedSize(state.mEnableBlockReuse);
+    totalSize += su::serializedSize(state.mEnablePartialReuse);
     totalSize += su::serializedSize(state.getHasIndexerKCache());
     totalSize += su::serializedSize(state.getIndexerDimPerHead());
     totalSize += su::serializedSize(state.getIndexerKCacheQuantBlockSize());
@@ -167,6 +167,11 @@ public:
         NB_OVERRIDE_PURE(isEnableBlockReuse);
     }
 
+    bool isEnablePartialReuse() const override
+    {
+        NB_OVERRIDE_PURE(isEnablePartialReuse);
+    }
+
     void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override
     {
         NB_OVERRIDE_PURE(rewindKVCache, requestId, rewindLengths);
@@ -1296,13 +1296,14 @@ TEST(SerializeUtilsTest, CacheStateIndexerKCache)
     int dpRank = 0;
     int dpSize = 1;
     bool enableBlockReuse = true;
+    bool enablePartialReuse = true;
     bool hasIndexerKCache = true;
     texec::SizeType32 indexerDimPerHead = 96;
     texec::SizeType32 indexerKCacheQuantBlockSize = 128;
 
     CacheState state{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tp, pp, cp, attentionLayerNumPerPP, dataType,
-        attentionType, kvFactor, enableAttentionDP, dpRank, dpSize, enableBlockReuse, hasIndexerKCache,
-        indexerDimPerHead, indexerKCacheQuantBlockSize};
+        attentionType, kvFactor, enableAttentionDP, dpRank, dpSize, enableBlockReuse, enablePartialReuse,
+        hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
 
     std::ostringstream oss;
     texec::Serialization::serialize(state, oss);
@@ -1320,6 +1321,7 @@ TEST(SerializeUtilsTest, CacheStateIndexerKCache)
     EXPECT_EQ(state.getAttentionConfig().mAttentionType, state2.getAttentionConfig().mAttentionType);
     EXPECT_EQ(state.getAttentionConfig().mKvFactor, state2.getAttentionConfig().mKvFactor);
     EXPECT_EQ(state.getEnableBlockReuse(), state2.getEnableBlockReuse());
+    EXPECT_EQ(state.getEnablePartialReuse(), state2.getEnablePartialReuse());
     EXPECT_EQ(state.getHasIndexerKCache(), state2.getHasIndexerKCache());
     EXPECT_EQ(state.getIndexerDimPerHead(), state2.getIndexerDimPerHead());
     EXPECT_EQ(state.getIndexerKCacheQuantBlockSize(), state2.getIndexerKCacheQuantBlockSize());
@@ -351,6 +351,9 @@ class PyExecutor:
             ResourceManagerType.KV_CACHE_MANAGER)
         self.enable_kv_cache_events = self.kv_cache_manager is not None and self.kv_cache_manager.event_buffer_max_size > 0
         self.enable_kv_cache_reuse = self.kv_cache_manager is not None and self.kv_cache_manager.enable_block_reuse
+        self.enable_partial_reuse_for_disagg = (
+            self.enable_kv_cache_reuse
+            and self.kv_cache_manager.enable_partial_reuse)
 
         self.max_input_len = max_input_len
         # _executor_loop private data
@@ -359,7 +362,7 @@ class PyExecutor:
         self.expected_num_active_requests = 0
         self.async_transfer_manager = AsyncTransferManager(
             self.resource_manager,
-            should_store_blocks=self.enable_kv_cache_reuse
+            should_store_blocks=self.enable_partial_reuse_for_disagg
             and not self.kv_cache_manager.is_vswa)
         self.previous_batch: Optional[BatchState] = None
         self.has_previous_draft_tokens = False
@@ -3003,7 +3006,7 @@ class PyExecutor:
                 logger.debug(
                     f"Request {request.py_request_id} has no avg_decoded_tokens_per_iter"
                 )
-            if self.enable_kv_cache_reuse and not self.kv_cache_manager.is_vswa:
+            if self.enable_partial_reuse_for_disagg and not self.kv_cache_manager.is_vswa:
                 requests_to_terminate.append(request)
             else:
                 if not request.is_disagg_context_transmission_state:
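Taken together, the PyExecutor changes above gate disaggregated block storage and request termination on partial reuse rather than on block reuse alone. A minimal sketch of that gating, reusing the manager attributes referenced in the hunks (the standalone helper itself is illustrative, not part of the patch):

# Illustrative only: enable_block_reuse, enable_partial_reuse and is_vswa are the
# KV cache manager attributes used above; this helper is hypothetical.
def should_store_blocks_for_disagg(kv_cache_manager) -> bool:
    if kv_cache_manager is None:
        return False
    enable_reuse = kv_cache_manager.enable_block_reuse
    partial_reuse_for_disagg = enable_reuse and kv_cache_manager.enable_partial_reuse
    # Store blocks for reuse only when partial reuse is enabled and the cache is
    # not a variable sliding-window (VSWA) cache.
    return partial_reuse_for_disagg and not kv_cache_manager.is_vswa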
@@ -471,6 +471,7 @@ class KVCacheManager(BaseResourceManager):
         self.num_pools = self.impl.num_pools
         self.max_blocks_per_seq = self.impl.max_blocks_per_seq
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
+        self.enable_partial_reuse = kv_cache_config.enable_partial_reuse
         self.host_kv_cache_block_offsets = torch.empty(self.num_pools,
                                                        max_batch_size *
                                                        max_beam_width,
@@ -1711,6 +1712,7 @@ class KVCacheManagerV2(BaseResourceManager):
         self.max_seq_len = max_num_tokens
 
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
+        self.enable_partial_reuse = kv_cache_config.enable_partial_reuse
 
         # Plus 1 for cuda graph dummy request
         self.index_mapper = IndexMapper(max_batch_size + 1, max_beam_width)
@@ -1273,20 +1273,27 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     @skip_pre_hopper
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
-    def test_auto_dtype(self, overlap_scheduler):
+    @pytest.mark.parametrize("enable_partial_reuse", [True, False])
+    def test_auto_dtype(self, overlap_scheduler, enable_partial_reuse):
+        kv_cache_config = {
+            "enable_block_reuse": True,
+            "enable_partial_reuse": enable_partial_reuse,
+        }
         ctx_server_config = {
             "disable_overlap_scheduler": True,
             "cuda_graph_config": None,
             "cache_transceiver_config": {
                 "backend": "DEFAULT"
-            }
+            },
+            "kv_cache_config": kv_cache_config,
         }
         gen_server_config = {
             "disable_overlap_scheduler": overlap_scheduler,
             "cuda_graph_config": None,
             "cache_transceiver_config": {
                 "backend": "DEFAULT"
-            }
+            },
+            "kv_cache_config": kv_cache_config,
         }
         disaggregated_server_config = {
             "hostname": "localhost",
@@ -332,8 +332,9 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_inst
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True-True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
@@ -207,8 +207,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_inst
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
 
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
@@ -25,8 +25,9 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=False]
-  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True-True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend