diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index 147dc64081..27676fb656 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -89,7 +89,7 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc } else { - backendType = executor::CacheTransceiverConfig::BackendType::UCX; + backendType = executor::CacheTransceiverConfig::BackendType::NIXL; } } cacheTransceiverConfig.value().setBackendType(backendType); diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 69e8891f44..12c29ba2b4 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -91,6 +91,17 @@ RUN if [ -f /etc/redhat-release ]; then \ grep "^Name:" METADATA; \ fi +# Install UCX first +COPY docker/common/install_ucx.sh install_ucx.sh +RUN bash ./install_ucx.sh && rm install_ucx.sh + +# Install NIXL +COPY docker/common/install_nixl.sh install_nixl.sh +RUN bash ./install_nixl.sh && rm install_nixl.sh + +# Install etcd +COPY docker/common/install_etcd.sh install_etcd.sh +RUN bash ./install_etcd.sh && rm install_etcd.sh FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton @@ -111,15 +122,6 @@ COPY docker/common/install_triton.sh \ RUN bash ./install_triton.sh && rm install_triton.sh -# Install UCX first -RUN bash ./install_ucx.sh && rm install_ucx.sh - -# Install NIXL -RUN bash ./install_nixl.sh && rm install_nixl.sh - -# Install etcd -RUN bash ./install_etcd.sh && rm install_etcd.sh - FROM ${DEVEL_IMAGE} AS wheel WORKDIR /src/tensorrt_llm COPY benchmarks benchmarks diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index 8af2c188a5..a6335b9c92 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -106,7 +106,7 @@ cache_transceiver_config: max_tokens_in_buffer: ``` -`backend` specifies the communication backend for transferring the kvCache, valid options 
include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is UCX. +`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`, `UCX`, `NIXL`, and `MPI`, the default backend is NIXL. `max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance. diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md index 68db9403d3..2cf211038d 100644 --- a/docs/source/installation/linux.md +++ b/docs/source/installation/linux.md @@ -22,6 +22,9 @@ pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 sudo apt-get -y install libopenmpi-dev + + # Optional step: Only required for disagg-serving + sudo apt-get -y install libzmq3-dev ``` PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs and SBSA platform. On prior GPUs or Linux x86_64 platform, this extra installation is not required. diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 3943d5fefb..40f1ff4611 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -12,7 +12,7 @@ The `trtllm-serve` command supports the `extra-llm-config.yaml` parameter. In th ```yaml cache_transceiver_config: - # KV cache transmission backend. Valid options include `DEFAULT` (i.e., UCX), `UCX`, `NIXL`. + # KV cache transmission backend. Valid options include `DEFAULT` (i.e., NIXL), `UCX`, `NIXL`. backend: # KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance. 
max_tokens_in_buffer: diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py index 5cc7d35c21..852d0352cf 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @@ -38,10 +38,10 @@ def create_kv_cache_transceiver( if cache_transceiver_config.backend == BackendTypeCpp.DEFAULT: # When cache_transceiver_config.backend is not set, fallback to env_vars settings - # UCX is the default backend - cache_transceiver_config.backend = BackendTypeCpp.UCX + # NIXL is the default backend + cache_transceiver_config.backend = BackendTypeCpp.NIXL # Ordered by priority - env_vars = [("TRTLLM_USE_NIXL_KVCACHE", BackendTypeCpp.NIXL), + env_vars = [("TRTLLM_USE_UCX_KVCACHE", BackendTypeCpp.UCX), ("TRTLLM_USE_MPI_KVCACHE", BackendTypeCpp.MPI)] for env_var, be_type in env_vars: if getenv(env_var) == "1": diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index b871582fc1..ffc84143ae 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -426,6 +426,9 @@ def run_disaggregated_test(example_dir, config_file ] else: + pytest.skip( + "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend." + ) with open(config_file, 'r') as f: config = yaml.safe_load(f)