Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 15:55:08 +08:00)
[https://nvbugs/5863392][fix] fix partial reuse disabled for disagg (#11247)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Parent: f9eed3ecc2
Commit: 18e611da77
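The change threads the KV-cache partial-reuse flag through the disaggregated-serving path (CacheState, the cache formatters and transceiver, and the Python executor). As a rough sketch of the knobs involved, using only field names that appear in this diff (the exact server-config layout below is an assumption, not the test harness verbatim):

# Hypothetical KV-cache / server configs for a disaggregated setup; the field
# names mirror those used in the test added by this change.
kv_cache_config = {
    "enable_block_reuse": True,
    "enable_partial_reuse": True,  # the flag whose propagation this change fixes
}
ctx_server_config = {
    "disable_overlap_scheduler": True,
    "cache_transceiver_config": {"backend": "DEFAULT"},
    "kv_cache_config": kv_cache_config,
}
gen_server_config = {
    "disable_overlap_scheduler": False,
    "cache_transceiver_config": {"backend": "DEFAULT"},
    "kv_cache_config": kv_cache_config,
}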
@@ -891,6 +891,11 @@ public:
         return mIsSWA;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const
+    {
+        return mEnablePartialReuse;
+    }
+
     [[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);
 
     //! \brief Unpin blocks by block ids directly
@@ -1078,6 +1083,11 @@ public:
         return mIndexerKCacheIndexHeadDim;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const
+    {
+        return mWindowBlockManagers.begin()->second.isEnablePartialReuse();
+    }
+
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
 
@@ -1565,6 +1575,8 @@ public:
 
     [[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
 
+    [[nodiscard]] virtual bool isEnablePartialReuse() const = 0;
+
     [[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
     [[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
     [[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
@@ -1912,6 +1924,11 @@ public:
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool isEnablePartialReuse() const override
+    {
+        return mBlockManager.isEnablePartialReuse();
+    }
+
     [[nodiscard]] bool isEnableIndexerKCache() const override
     {
         return mBlockManager.isEnableIndexerKCache();
@@ -51,7 +51,8 @@ public:
     CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableBlockReuse = false,
-        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        bool enablePartialReuse = false, bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0,
+        SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
             worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
@@ -60,6 +61,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -69,8 +71,8 @@ public:
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
-        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool enablePartialReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
@@ -78,6 +80,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -87,8 +90,8 @@ public:
         SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
         std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool hasIndexerKCache = false,
-        SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
+        int DPrank = 0, int DPsize = 0, bool enableBlockReuse = false, bool enablePartialReuse = false,
+        bool hasIndexerKCache = false, SizeType32 indexerDimPerHead = 0, SizeType32 indexerKCacheQuantBlockSize = 128)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
         , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
             attentionLayerNumPerPP}
@@ -96,6 +99,7 @@ public:
         , mAttentionConfig(attentionType, kvFactor)
     {
         mEnableBlockReuse = enableBlockReuse;
+        mEnablePartialReuse = enablePartialReuse;
         mHasIndexerKCache = hasIndexerKCache;
         mIndexerDimPerHead = indexerDimPerHead;
         mIndexerKCacheQuantBlockSize = indexerKCacheQuantBlockSize;
@@ -186,6 +190,11 @@ public:
         return mEnableBlockReuse;
     }
 
+    [[nodiscard]] bool getEnablePartialReuse() const
+    {
+        return mEnablePartialReuse;
+    }
+
     [[nodiscard]] bool getHasIndexerKCache() const
     {
         return mHasIndexerKCache;
@@ -221,6 +230,7 @@ public:
         sstring << "dpRank:" << mParallelConfig.mDPrank << "\n";
         sstring << "dpSize:" << mParallelConfig.mDPsize << "\n";
         sstring << "enableBlockReuse:" << mEnableBlockReuse << "\n";
+        sstring << "enablePartialReuse:" << mEnablePartialReuse << "\n";
         sstring << "hasIndexerKCache:" << mHasIndexerKCache << "\n";
         sstring << "indexerDimPerHead:" << mIndexerDimPerHead << "\n";
         sstring << "indexerKCacheQuantBlockSize:" << mIndexerKCacheQuantBlockSize << "\n";
@@ -234,6 +244,7 @@ private:
     nvinfer1::DataType mDataType;
    AttentionConfig mAttentionConfig;
     bool mEnableBlockReuse{false};
+    bool mEnablePartialReuse{false};
     bool mHasIndexerKCache{false};
     SizeType32 mIndexerDimPerHead{0};
     SizeType32 mIndexerKCacheQuantBlockSize{128};
@@ -50,7 +50,8 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
 
     // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is
     // distributed among CP ranks. So, we transfer all blocks from send side.
-    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
+    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || !cacheManager->isEnablePartialReuse()
+        || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
     {
         // disable reuse path, and vwsa don't support reuse.
         bool needSendAllForWindow = common::getEnvKVCacheTransferAllBlocksForWindow();
@@ -87,13 +88,13 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
     return BlockRange::fromReuseTree(*cacheManager, lastBlockKey, indexFromEnd);
 }
 
-BlockRange getBlockRangeForReceiving(
-    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse, bool recvSideHasCP)
+BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
+    bool srcEnableBlockReuse, bool srcEnablePartialReuse, bool recvSideHasCP)
 {
     // Note: When recv side has CP, we request all blocks from send side right now.
     auto poolNum = cacheManager->getBlockManager().getNumPools(
         /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
-    if (poolNum == 1 && srcEnableBlockReuse && !recvSideHasCP)
+    if (poolNum == 1 && srcEnableBlockReuse && srcEnablePartialReuse && !recvSideHasCP)
     {
         // Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.
         auto windowSize = cacheManager->getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -555,7 +556,8 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
     auto const& destConfig = session.getOtherState().getCacheState().value();
     auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
     auto& bufferManager = session.getBufferManager();
-    auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse());
+    auto blockRange = getBlockRangeForReceiving(
+        mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), destConfig.getEnablePartialReuse());
 
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
@@ -53,7 +53,7 @@ using CacheTransBufferManager = kv_cache_manager::CacheTransBufferManager;
 using BlockRange = kv_cache_manager::BlockRange;
 
 BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
-    bool srcEnableBlockReuse, bool recvSideHasCP = false);
+    bool srcEnableBlockReuse, bool srcEnablePartialReuse, bool recvSideHasCP = false);
 
 // Used to support the cache transmission with different layouts and different protocols.
 class BaseCacheFormatter
@@ -143,8 +143,8 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
     }
     mCacheState = std::make_unique<executor::kv_cache::CacheState>(cacheStateModelCfg, worldConfig,
         attentionLayerNumPerPP, dataType, attentionType, kvFactor, cacheManager->isEnableBlockReuse(),
-        cacheManager->isEnableIndexerKCache(), cacheManager->getIndexerKCacheIndexHeadDim(),
-        cacheManager->getIndexerKCacheQuantBlockSize());
+        cacheManager->isEnablePartialReuse(), cacheManager->isEnableIndexerKCache(),
+        cacheManager->getIndexerKCacheIndexHeadDim(), cacheManager->getIndexerKCacheQuantBlockSize());
 
     if (mCacheState->getParallelConfig().mEnableAttentionDP)
     {
@@ -825,8 +825,8 @@ public:
     {
         auto* cacheManager = mFormatter->getCacheManager();
         auto beam = 0;
-        auto requestedBlockRange
-            = getBlockRangeForReceiving(cacheManager, llmRequest, destCacheState.getEnableBlockReuse());
+        auto requestedBlockRange = getBlockRangeForReceiving(
+            cacheManager, llmRequest, destCacheState.getEnableBlockReuse(), destCacheState.getEnablePartialReuse());
 
         auto const& uniqueTokens = llmRequest.getUniqueTokens(beam);
         auto lastBlockKey
@@ -357,8 +357,8 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& s
     auto& bufferManager = session.getBufferManager();
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
     bool const recvSideHasCP = selfConfig.getParallelConfig().mContextParallelism > 1;
-    auto blockRange
-        = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), recvSideHasCP);
+    auto blockRange = getBlockRangeForReceiving(
+        mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), destConfig.getEnablePartialReuse(), recvSideHasCP);
     auto const numPools = mCacheManager->getBlockManager().getNumPools(
         /*includeBlockScalePools=*/false, /*includeIndexerKCachePools=*/false);
     auto const& windowSizes = blockRange.getWindowSizes();
@@ -544,12 +544,13 @@ kv_cache::CacheState Serialization::deserializeCacheState(std::istream& is)
     auto attentionType = su::deserialize<decltype(CacheState::AttentionConfig::mAttentionType)>(is);
     auto kvFactor = su::deserialize<decltype(CacheState::AttentionConfig::mKvFactor)>(is);
     auto enableBlockReuse = su::deserialize<bool>(is);
+    auto enablePartialReuse = su::deserialize<bool>(is);
     auto hasIndexerKCache = su::deserialize<bool>(is);
     auto indexerDimPerHead = su::deserialize<decltype(CacheState::ModelConfig::mSizePerHead)>(is);
     auto indexerKCacheQuantBlockSize = su::deserialize<decltype(CacheState::ModelConfig::mTokensPerBlock)>(is);
     return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism,
         contextParallelism, attentionLayerNumPerPP, dataType, attentionType, kvFactor, enableAttentionDP, DPrank,
-        DPsize, enableBlockReuse, hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
+        DPsize, enableBlockReuse, enablePartialReuse, hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
 }
 
 void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& os)
@@ -568,6 +569,7 @@ void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& o
     su::serialize(state.mAttentionConfig.mAttentionType, os);
     su::serialize(state.mAttentionConfig.mKvFactor, os);
     su::serialize(state.mEnableBlockReuse, os);
+    su::serialize(state.mEnablePartialReuse, os);
     su::serialize(state.getHasIndexerKCache(), os);
     su::serialize(state.getIndexerDimPerHead(), os);
     su::serialize(state.getIndexerKCacheQuantBlockSize(), os);
@@ -590,6 +592,7 @@ size_t Serialization::serializedSize(kv_cache::CacheState const& state)
     totalSize += su::serializedSize(state.mAttentionConfig.mAttentionType);
     totalSize += su::serializedSize(state.mAttentionConfig.mKvFactor);
     totalSize += su::serializedSize(state.mEnableBlockReuse);
+    totalSize += su::serializedSize(state.mEnablePartialReuse);
     totalSize += su::serializedSize(state.getHasIndexerKCache());
     totalSize += su::serializedSize(state.getIndexerDimPerHead());
     totalSize += su::serializedSize(state.getIndexerKCacheQuantBlockSize());
@@ -167,6 +167,11 @@ public:
         NB_OVERRIDE_PURE(isEnableBlockReuse);
     }
 
+    bool isEnablePartialReuse() const override
+    {
+        NB_OVERRIDE_PURE(isEnablePartialReuse);
+    }
+
     void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override
     {
         NB_OVERRIDE_PURE(rewindKVCache, requestId, rewindLengths);
@@ -1296,13 +1296,14 @@ TEST(SerializeUtilsTest, CacheStateIndexerKCache)
     int dpRank = 0;
     int dpSize = 1;
     bool enableBlockReuse = true;
+    bool enablePartialReuse = true;
     bool hasIndexerKCache = true;
     texec::SizeType32 indexerDimPerHead = 96;
     texec::SizeType32 indexerKCacheQuantBlockSize = 128;
 
     CacheState state{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tp, pp, cp, attentionLayerNumPerPP, dataType,
-        attentionType, kvFactor, enableAttentionDP, dpRank, dpSize, enableBlockReuse, hasIndexerKCache,
-        indexerDimPerHead, indexerKCacheQuantBlockSize};
+        attentionType, kvFactor, enableAttentionDP, dpRank, dpSize, enableBlockReuse, enablePartialReuse,
+        hasIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize};
 
     std::ostringstream oss;
     texec::Serialization::serialize(state, oss);
@@ -1320,6 +1321,7 @@ TEST(SerializeUtilsTest, CacheStateIndexerKCache)
     EXPECT_EQ(state.getAttentionConfig().mAttentionType, state2.getAttentionConfig().mAttentionType);
     EXPECT_EQ(state.getAttentionConfig().mKvFactor, state2.getAttentionConfig().mKvFactor);
     EXPECT_EQ(state.getEnableBlockReuse(), state2.getEnableBlockReuse());
+    EXPECT_EQ(state.getEnablePartialReuse(), state2.getEnablePartialReuse());
     EXPECT_EQ(state.getHasIndexerKCache(), state2.getHasIndexerKCache());
     EXPECT_EQ(state.getIndexerDimPerHead(), state2.getIndexerDimPerHead());
     EXPECT_EQ(state.getIndexerKCacheQuantBlockSize(), state2.getIndexerKCacheQuantBlockSize());
@@ -351,6 +351,9 @@ class PyExecutor:
             ResourceManagerType.KV_CACHE_MANAGER)
         self.enable_kv_cache_events = self.kv_cache_manager is not None and self.kv_cache_manager.event_buffer_max_size > 0
         self.enable_kv_cache_reuse = self.kv_cache_manager is not None and self.kv_cache_manager.enable_block_reuse
+        self.enable_partial_reuse_for_disagg = (
+            self.enable_kv_cache_reuse
+            and self.kv_cache_manager.enable_partial_reuse)
 
         self.max_input_len = max_input_len
         # _executor_loop private data
@@ -359,7 +362,7 @@ class PyExecutor:
         self.expected_num_active_requests = 0
         self.async_transfer_manager = AsyncTransferManager(
             self.resource_manager,
-            should_store_blocks=self.enable_kv_cache_reuse
+            should_store_blocks=self.enable_partial_reuse_for_disagg
             and not self.kv_cache_manager.is_vswa)
         self.previous_batch: Optional[BatchState] = None
         self.has_previous_draft_tokens = False
@@ -3003,7 +3006,7 @@ class PyExecutor:
                 logger.debug(
                     f"Request {request.py_request_id} has no avg_decoded_tokens_per_iter"
                 )
-            if self.enable_kv_cache_reuse and not self.kv_cache_manager.is_vswa:
+            if self.enable_partial_reuse_for_disagg and not self.kv_cache_manager.is_vswa:
                 requests_to_terminate.append(request)
             else:
                 if not request.is_disagg_context_transmission_state:
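Taken together, the PyExecutor changes above gate disaggregated block storage and request termination on partial reuse rather than on block reuse alone. A minimal sketch of that gating, reusing the manager attributes referenced in the hunks (the standalone helper itself is illustrative, not part of the patch):

# Illustrative only: enable_block_reuse, enable_partial_reuse and is_vswa are the
# KV cache manager attributes used above; this helper is hypothetical.
def should_store_blocks_for_disagg(kv_cache_manager) -> bool:
    if kv_cache_manager is None:
        return False
    enable_reuse = kv_cache_manager.enable_block_reuse
    partial_reuse_for_disagg = enable_reuse and kv_cache_manager.enable_partial_reuse
    # Store blocks for reuse only when partial reuse is enabled and the cache is
    # not a variable sliding-window (VSWA) cache.
    return partial_reuse_for_disagg and not kv_cache_manager.is_vswa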
@@ -471,6 +471,7 @@ class KVCacheManager(BaseResourceManager):
         self.num_pools = self.impl.num_pools
         self.max_blocks_per_seq = self.impl.max_blocks_per_seq
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
+        self.enable_partial_reuse = kv_cache_config.enable_partial_reuse
         self.host_kv_cache_block_offsets = torch.empty(self.num_pools,
                                                        max_batch_size *
                                                        max_beam_width,
@@ -1711,6 +1712,7 @@ class KVCacheManagerV2(BaseResourceManager):
         self.max_seq_len = max_num_tokens
 
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
+        self.enable_partial_reuse = kv_cache_config.enable_partial_reuse
 
         # Plus 1 for cuda graph dummy request
         self.index_mapper = IndexMapper(max_batch_size + 1, max_beam_width)
@@ -1273,20 +1273,27 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     @skip_pre_hopper
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
-    def test_auto_dtype(self, overlap_scheduler):
+    @pytest.mark.parametrize("enable_partial_reuse", [True, False])
+    def test_auto_dtype(self, overlap_scheduler, enable_partial_reuse):
+        kv_cache_config = {
+            "enable_block_reuse": True,
+            "enable_partial_reuse": enable_partial_reuse,
+        }
         ctx_server_config = {
             "disable_overlap_scheduler": True,
             "cuda_graph_config": None,
             "cache_transceiver_config": {
                 "backend": "DEFAULT"
-            }
+            },
+            "kv_cache_config": kv_cache_config,
         }
         gen_server_config = {
             "disable_overlap_scheduler": overlap_scheduler,
             "cuda_graph_config": None,
             "cache_transceiver_config": {
                 "backend": "DEFAULT"
-            }
+            },
+            "kv_cache_config": kv_cache_config,
         }
         disaggregated_server_config = {
             "hostname": "localhost",
@@ -332,8 +332,9 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_inst
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True-True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
@@ -207,8 +207,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_inst
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
 
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
@@ -25,8 +25,9 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=False]
-  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
-  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True-True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False-False]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend