Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-16 07:53:55 +08:00

[https://nvbugs/5880261][fix] fix cacheTransceiver (#11409)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
This commit is contained in:
parent 29e44dd749
commit 0a9ddf8c17

@@ -179,8 +179,8 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque
     TLLM_CHECK(deviceId == mAgentConnectionManager->getDeviceId());
     for (size_t i = 0; i < preAllocateBuffers.size(); i++)
     {
-        bufferDescs.emplace_back(
-            reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()), preAllocateBuffers[i]->getSize(), deviceId);
+        bufferDescs.emplace_back(reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()),
+            preAllocateBuffers[i]->getSizeInBytes(), deviceId);
     }
     std::string address = mAgentConnectionManager->getAgent()->getLocalConnectionInfo();
     std::optional<std::string> metadataOpt = std::nullopt;
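
The change above swaps getSize() (element count) for getSizeInBytes() when building the memory descriptors that the cache transceiver registers for KV-cache transfer. A minimal, standard-library-only sketch of why that distinction matters; make_desc and the array buffer are hypothetical stand-ins for the bufferDescs entries and pre-allocated buffers in the hunk:

# Sketch only: a transfer descriptor must carry the buffer's size in bytes,
# not its element count, once elements are wider than one byte.
import array

buf = array.array('f', [0.0] * 1024)        # 1024 float32 elements, like one pre-allocated block
addr, num_elements = buf.buffer_info()       # element count, analogous to getSize()
num_bytes = num_elements * buf.itemsize      # byte count,   analogous to getSizeInBytes()

def make_desc(address: int, length_bytes: int, device_id: int) -> tuple:
    """Hypothetical descriptor: (address, length in bytes, device id)."""
    return (address, length_bytes, device_id)

# Registering only `num_elements` bytes would cover a quarter of this float32
# allocation, so a peer transfer could touch unregistered memory.
desc = make_desc(addr, num_bytes, device_id=0)
assert desc[1] == 4 * num_elements
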
@@ -1606,8 +1606,8 @@ TEST_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest)
     if (mIsContext || mIsGeneration)
     {
         bool enableDP = mIsContext ? contextDP : generationDP;
-        setUpCacheManager(
-            numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, isWindow);
+        setUpCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP,
+            isWindow, isIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize);
         setUpCacheTransceiver();
         std::vector<std::shared_ptr<WrappedLlmRequest>> requests;
         int requestId = 0;
@@ -436,11 +436,11 @@ def run_disaggregated_test(example_dir,
     cleanup_output_files()
     run_env = env.copy()

-    # on some CI nodes , we set UCX_TLS to "^ib" to avoid the issue that IB equipped but not available.
+    # on some CI nodes , we set UCX_TLS to "^ib,gdr_copy" to avoid the issue that IB equipped but not available, and gdr_copy pin buffer failed.
     # we set UCX_MM_ERROR_HANDLING to "y" to avoid the issue that NIXL cannot use IB or TCP for notify on some CI nodes,
     # setting it to "y" will enable NIXL to use system memory for notify.

-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                               os.path.dirname(__file__))
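
The remaining hunks apply the same environment tweak before launching disaggregated workers. A small self-contained sketch of that pattern, using only the standard library; the make_ci_env helper name is hypothetical, the variable values mirror the diff:

# Sketch of the CI environment setup repeated throughout these tests.
import os

def make_ci_env(base_env: dict) -> dict:
    run_env = base_env.copy()
    # Exclude InfiniBand transports (present but unusable on some CI nodes) and
    # gdr_copy, whose buffer pinning can fail on those nodes.
    run_env["UCX_TLS"] = "^ib,gdr_copy"
    # Let NIXL fall back to system memory for its notify channel when it cannot
    # use IB or TCP.
    run_env["UCX_MM_ERROR_HANDLING"] = "y"
    return run_env

if __name__ == "__main__":
    env = make_ci_env(os.environ.copy())
    print(env["UCX_TLS"], env["UCX_MM_ERROR_HANDLING"])
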
@@ -1260,7 +1260,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root,
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_ucx",
                            env=env,
@@ -1287,7 +1287,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root,
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_NIXL_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     env["UCX_MM_ERROR_HANDLING"] = "y"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_nixl",
@@ -1313,7 +1313,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu(
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"

     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_tp1",
@@ -1595,7 +1595,7 @@ def run_disaggregated_benchmark(example_dir,
                                 skip_warmup=False):
     """Run disaggregated test with given configuration."""
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     workers_cmd = [
         'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
@@ -1776,7 +1776,7 @@ def run_disaggregated_aiperf(config_file,
     """
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"

     workers_cmd = [
@@ -2302,7 +2302,7 @@ def run_disaggregated_cancel_test(example_dir,
     """Run disaggregated test with request cancellation stress test."""
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"

     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                               os.path.dirname(__file__))
@@ -186,7 +186,7 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,

     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -336,7 +336,7 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,

     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -447,7 +447,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
     mpi_info.Set("oversubscribe", "true")
     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y",
                              "OMPI_MCA_rmaps_base_oversubscribe": "1"
                          },
@@ -337,8 +337,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_dec
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5879620)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] SKIP (https://nvbugs/5879625)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] SKIP (https://nvbugs/5879625)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] SKIP (https://nvbugs/5880261)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] SKIP (https://nvbugs/5880261)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] SKIP (https://nvbugs/5879625)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5839137)