From 0a9ddf8c17e5c8bf63cbbd60dc5f07247f89d7d1 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Sun, 15 Feb 2026 10:40:44 +0800
Subject: [PATCH] [https://nvbugs/5880261][fix] fix cacheTransceiver (#11409)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
---
 .../agent_utils/connection.cpp                   |  4 ++--
 .../multi_gpu/cacheTransceiverTest.cpp           |  4 ++--
 .../defs/disaggregated/test_disaggregated.py     | 16 ++++++++--------
 .../test_disaggregated_single_gpu.py             |  6 +++---
 tests/integration/test_lists/waives.txt          |  2 --
 5 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp
index 3e9c7485bb..36962720ba 100644
--- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp
+++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp
@@ -179,8 +179,8 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque
     TLLM_CHECK(deviceId == mAgentConnectionManager->getDeviceId());
     for (size_t i = 0; i < preAllocateBuffers.size(); i++)
     {
-        bufferDescs.emplace_back(
-            reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()), preAllocateBuffers[i]->getSize(), deviceId);
+        bufferDescs.emplace_back(reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()),
+            preAllocateBuffers[i]->getSizeInBytes(), deviceId);
     }
     std::string address = mAgentConnectionManager->getAgent()->getLocalConnectionInfo();
     std::optional<std::string> metadataOpt = std::nullopt;
diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
index cfda94db2e..7fe546a3ef 100644
--- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
+++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
@@ -1606,8 +1606,8 @@ TEST_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest)
     if (mIsContext || mIsGeneration)
     {
         bool enableDP = mIsContext ? contextDP : generationDP;
-        setUpCacheManager(
-            numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, isWindow);
+        setUpCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP,
+            isWindow, isIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize);
         setUpCacheTransceiver();
         std::vector<std::shared_ptr<WrappedLlmRequest>> requests;
         int requestId = 0;
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index 785866bb4e..f3638728e9 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -436,11 +436,11 @@ def run_disaggregated_test(example_dir,
     cleanup_output_files()
     run_env = env.copy()
 
-    # on some CI nodes , we set UCX_TLS to "^ib" to avoid the issue that IB equipped but not available.
+    # On some CI nodes, we set UCX_TLS to "^ib,gdr_copy" to avoid two issues: IB is equipped but not available, and gdr_copy fails to pin buffers.
     # we set UCX_MM_ERROR_HANDLING to "y" to avoid the issue that NIXL cannot use IB or TCP for notify on some CI nodes,
     # setting it to "y" will enable NIXL to use system memory for notify.
 
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                              os.path.dirname(__file__))
@@ -1260,7 +1260,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root,
             os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_ucx",
                            env=env,
@@ -1287,7 +1287,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root,
             os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_NIXL_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     env["UCX_MM_ERROR_HANDLING"] = "y"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_nixl",
@@ -1313,7 +1313,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu(
             os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
 
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_tp1",
@@ -1595,7 +1595,7 @@ def run_disaggregated_benchmark(example_dir,
                                 skip_warmup=False):
     """Run disaggregated test with given configuration."""
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     workers_cmd = [
         'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
@@ -1776,7 +1776,7 @@ def run_disaggregated_aiperf(config_file,
     """
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
 
     workers_cmd = [
@@ -2302,7 +2302,7 @@ def run_disaggregated_cancel_test(example_dir,
     """Run disaggregated test with request cancellation stress test."""
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
 
     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                              os.path.dirname(__file__))
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index f72c072aaa..00b61bbeea 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -186,7 +186,7 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
 
     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -336,7 +336,7 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
 
     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -447,7 +447,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
     mpi_info.Set("oversubscribe", "true")
     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y",
                              "OMPI_MCA_rmaps_base_oversubscribe": "1"
                          },
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index a3fbe818fb..cb4257cf17 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -337,8 +337,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_dec
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5879620)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] SKIP (https://nvbugs/5879625)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] SKIP (https://nvbugs/5879625)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] SKIP (https://nvbugs/5880261)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] SKIP (https://nvbugs/5880261)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] SKIP (https://nvbugs/5879625)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5839137)