From 0a9ddf8c17e5c8bf63cbbd60dc5f07247f89d7d1 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Sun, 15 Feb 2026 10:40:44 +0800 Subject: [PATCH] [https://nvbugs/5880261][fix] fix cacheTransceiver (#11409) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> --- .../agent_utils/connection.cpp | 4 ++-- .../multi_gpu/cacheTransceiverTest.cpp | 4 ++-- .../defs/disaggregated/test_disaggregated.py | 16 ++++++++-------- .../test_disaggregated_single_gpu.py | 6 +++--- tests/integration/test_lists/waives.txt | 2 -- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index 3e9c7485bb..36962720ba 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -179,8 +179,8 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque TLLM_CHECK(deviceId == mAgentConnectionManager->getDeviceId()); for (size_t i = 0; i < preAllocateBuffers.size(); i++) { - bufferDescs.emplace_back( - reinterpret_cast(preAllocateBuffers[i]->data()), preAllocateBuffers[i]->getSize(), deviceId); + bufferDescs.emplace_back(reinterpret_cast(preAllocateBuffers[i]->data()), + preAllocateBuffers[i]->getSizeInBytes(), deviceId); } std::string address = mAgentConnectionManager->getAgent()->getLocalConnectionInfo(); std::optional metadataOpt = std::nullopt; diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index cfda94db2e..7fe546a3ef 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -1606,8 +1606,8 @@ TEST_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest) if (mIsContext || mIsGeneration) { bool 
enableDP = mIsContext ? contextDP : generationDP; - setUpCacheManager( - numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, isWindow); + setUpCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, + isWindow, isIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize); setUpCacheTransceiver(); std::vector> requests; int requestId = 0; diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 785866bb4e..f3638728e9 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -436,11 +436,11 @@ def run_disaggregated_test(example_dir, cleanup_output_files() run_env = env.copy() - # on some CI nodes , we set UCX_TLS to "^ib" to avoid the issue that IB equipped but not available. + # On some CI nodes, we set UCX_TLS to "^ib,gdr_copy" to avoid failures where IB is equipped but not available, and where gdr_copy fails to pin buffers. # we set UCX_MM_ERROR_HANDLING to "y" to avoid the issue that NIXL cannot use IB or TCP for notify on some CI nodes, # setting it to "y" will enable NIXL to use system memory for notify. 
- run_env["UCX_TLS"] = "^ib" + run_env["UCX_TLS"] = "^ib,gdr_copy" run_env["UCX_MM_ERROR_HANDLING"] = "y" num_ranks, config_file = get_test_config(test_desc, example_dir, os.path.dirname(__file__)) @@ -1260,7 +1260,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root, os.symlink(src, dst, target_is_directory=True) env = llm_venv._new_env.copy() env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" + env["UCX_TLS"] = "^ib,gdr_copy" run_disaggregated_test(disaggregated_example_root, "deepseek_v3_lite_fp8_ucx", env=env, @@ -1287,7 +1287,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root, os.symlink(src, dst, target_is_directory=True) env = llm_venv._new_env.copy() env["TRTLLM_USE_NIXL_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" + env["UCX_TLS"] = "^ib,gdr_copy" env["UCX_MM_ERROR_HANDLING"] = "y" run_disaggregated_test(disaggregated_example_root, "deepseek_v3_lite_fp8_nixl", @@ -1313,7 +1313,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu( os.symlink(src, dst, target_is_directory=True) env = llm_venv._new_env.copy() env["TRTLLM_USE_UCX_KVCACHE"] = "1" - env["UCX_TLS"] = "^ib" + env["UCX_TLS"] = "^ib,gdr_copy" run_disaggregated_test(disaggregated_example_root, "deepseek_v3_lite_fp8_tp1", @@ -1595,7 +1595,7 @@ def run_disaggregated_benchmark(example_dir, skip_warmup=False): """Run disaggregated test with given configuration.""" run_env = env.copy() - run_env["UCX_TLS"] = "^ib" + run_env["UCX_TLS"] = "^ib,gdr_copy" run_env["UCX_MM_ERROR_HANDLING"] = "y" workers_cmd = [ 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', @@ -1776,7 +1776,7 @@ def run_disaggregated_aiperf(config_file, """ cleanup_output_files() run_env = env.copy() - run_env["UCX_TLS"] = "^ib" + run_env["UCX_TLS"] = "^ib,gdr_copy" run_env["UCX_MM_ERROR_HANDLING"] = "y" workers_cmd = [ @@ -2302,7 +2302,7 @@ def run_disaggregated_cancel_test(example_dir, """Run disaggregated test with request cancellation stress test.""" 
cleanup_output_files() run_env = env.copy() - run_env["UCX_TLS"] = "^ib" + run_env["UCX_TLS"] = "^ib,gdr_copy" num_ranks, config_file = get_test_config(test_desc, example_dir, os.path.dirname(__file__)) diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index f72c072aaa..00b61bbeea 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -186,7 +186,7 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt, with MPIPoolExecutor(max_workers=2, env={ - "UCX_TLS": "^ib", + "UCX_TLS": "^ib,gdr_copy", "UCX_MM_ERROR_HANDLING": "y" }) as executor: futures = [] @@ -336,7 +336,7 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph, with MPIPoolExecutor(max_workers=2, env={ - "UCX_TLS": "^ib", + "UCX_TLS": "^ib,gdr_copy", "UCX_MM_ERROR_HANDLING": "y" }) as executor: futures = [] @@ -447,7 +447,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, mpi_info.Set("oversubscribe", "true") with MPIPoolExecutor(max_workers=2, env={ - "UCX_TLS": "^ib", + "UCX_TLS": "^ib,gdr_copy", "UCX_MM_ERROR_HANDLING": "y", "OMPI_MCA_rmaps_base_oversubscribe": "1" }, diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index a3fbe818fb..cb4257cf17 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -337,8 +337,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_dec accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5879620) accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] SKIP (https://nvbugs/5879625) 
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] SKIP (https://nvbugs/5879625) -cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] SKIP (https://nvbugs/5880261) -cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] SKIP (https://nvbugs/5880261) accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] SKIP (https://nvbugs/5879625) unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5839137)