Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-16 07:53:55 +08:00

[https://nvbugs/5880261][fix] fix cacheTransceiver (#11409)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
This commit is contained in:
parent 29e44dd749
commit 0a9ddf8c17

@@ -179,8 +179,8 @@ void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& reque
     TLLM_CHECK(deviceId == mAgentConnectionManager->getDeviceId());
     for (size_t i = 0; i < preAllocateBuffers.size(); i++)
     {
-        bufferDescs.emplace_back(
-            reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()), preAllocateBuffers[i]->getSize(), deviceId);
+        bufferDescs.emplace_back(reinterpret_cast<uintptr_t>(preAllocateBuffers[i]->data()),
+            preAllocateBuffers[i]->getSizeInBytes(), deviceId);
     }
     std::string address = mAgentConnectionManager->getAgent()->getLocalConnectionInfo();
     std::optional<std::string> metadataOpt = std::nullopt;
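
The change above swaps getSize() (element count) for getSizeInBytes() when building the memory descriptors that the cache transceiver registers for KV-cache transfer. A minimal, standard-library-only sketch of why that distinction matters; make_desc and the array buffer are hypothetical stand-ins for the bufferDescs entries and pre-allocated buffers in the hunk:

# Sketch only: a transfer descriptor must carry the buffer's size in bytes,
# not its element count, once elements are wider than one byte.
import array

buf = array.array('f', [0.0] * 1024)        # 1024 float32 elements, like one pre-allocated block
addr, num_elements = buf.buffer_info()       # element count, analogous to getSize()
num_bytes = num_elements * buf.itemsize      # byte count,   analogous to getSizeInBytes()

def make_desc(address: int, length_bytes: int, device_id: int) -> tuple:
    """Hypothetical descriptor: (address, length in bytes, device id)."""
    return (address, length_bytes, device_id)

# Registering only `num_elements` bytes would cover a quarter of this float32
# allocation, so a peer transfer could touch unregistered memory.
desc = make_desc(addr, num_bytes, device_id=0)
assert desc[1] == 4 * num_elements
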
@@ -1606,8 +1606,8 @@ TEST_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest)
     if (mIsContext || mIsGeneration)
     {
         bool enableDP = mIsContext ? contextDP : generationDP;
-        setUpCacheManager(
-            numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, isWindow);
+        setUpCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP,
+            isWindow, isIndexerKCache, indexerDimPerHead, indexerKCacheQuantBlockSize);
         setUpCacheTransceiver();
         std::vector<std::shared_ptr<WrappedLlmRequest>> requests;
         int requestId = 0;
@@ -436,11 +436,11 @@ def run_disaggregated_test(example_dir,
     cleanup_output_files()
     run_env = env.copy()

-    # on some CI nodes , we set UCX_TLS to "^ib" to avoid the issue that IB equipped but not available.
+    # on some CI nodes , we set UCX_TLS to "^ib,gdr_copy" to avoid the issue that IB equipped but not available, and gdr_copy pin buffer failed.
     # we set UCX_MM_ERROR_HANDLING to "y" to avoid the issue that NIXL cannot use IB or TCP for notify on some CI nodes,
     # setting it to "y" will enable NIXL to use system memory for notify.

-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                               os.path.dirname(__file__))
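
The remaining hunks apply the same environment tweak before launching disaggregated workers. A small self-contained sketch of that pattern, using only the standard library; the make_ci_env helper name is hypothetical, the variable values mirror the diff:

# Sketch of the CI environment setup repeated throughout these tests.
import os

def make_ci_env(base_env: dict) -> dict:
    run_env = base_env.copy()
    # Exclude InfiniBand transports (present but unusable on some CI nodes) and
    # gdr_copy, whose buffer pinning can fail on those nodes.
    run_env["UCX_TLS"] = "^ib,gdr_copy"
    # Let NIXL fall back to system memory for its notify channel when it cannot
    # use IB or TCP.
    run_env["UCX_MM_ERROR_HANDLING"] = "y"
    return run_env

if __name__ == "__main__":
    env = make_ci_env(os.environ.copy())
    print(env["UCX_TLS"], env["UCX_MM_ERROR_HANDLING"])
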
@@ -1260,7 +1260,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root,
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_ucx",
                            env=env,
@@ -1287,7 +1287,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root,
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_NIXL_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"
     env["UCX_MM_ERROR_HANDLING"] = "y"
     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_nixl",
@@ -1313,7 +1313,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx_tp1_single_gpu(
     os.symlink(src, dst, target_is_directory=True)
     env = llm_venv._new_env.copy()
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
-    env["UCX_TLS"] = "^ib"
+    env["UCX_TLS"] = "^ib,gdr_copy"

     run_disaggregated_test(disaggregated_example_root,
                            "deepseek_v3_lite_fp8_tp1",
@@ -1595,7 +1595,7 @@ def run_disaggregated_benchmark(example_dir,
                                 skip_warmup=False):
     """Run disaggregated test with given configuration."""
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"
     workers_cmd = [
         'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
@@ -1776,7 +1776,7 @@ def run_disaggregated_aiperf(config_file,
     """
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"
     run_env["UCX_MM_ERROR_HANDLING"] = "y"

     workers_cmd = [
@@ -2302,7 +2302,7 @@ def run_disaggregated_cancel_test(example_dir,
     """Run disaggregated test with request cancellation stress test."""
     cleanup_output_files()
     run_env = env.copy()
-    run_env["UCX_TLS"] = "^ib"
+    run_env["UCX_TLS"] = "^ib,gdr_copy"

     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                               os.path.dirname(__file__))
@@ -186,7 +186,7 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,

     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -336,7 +336,7 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,

     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y"
                          }) as executor:
         futures = []
@@ -447,7 +447,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
     mpi_info.Set("oversubscribe", "true")
     with MPIPoolExecutor(max_workers=2,
                          env={
-                             "UCX_TLS": "^ib",
+                             "UCX_TLS": "^ib,gdr_copy",
                              "UCX_MM_ERROR_HANDLING": "y",
                              "OMPI_MCA_rmaps_base_oversubscribe": "1"
                          },
@@ -337,8 +337,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_dec
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5879620)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] SKIP (https://nvbugs/5879625)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] SKIP (https://nvbugs/5879625)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] SKIP (https://nvbugs/5880261)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] SKIP (https://nvbugs/5880261)
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] SKIP (https://nvbugs/5879625)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limitinf-beta0-alpha0.1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5839137)