From d010b2043a1a3a5148759f9ec6075da1c625c74d Mon Sep 17 00:00:00 2001 From: Shi Xiaowei <39303645+Shixiaowei02@users.noreply.github.com> Date: Mon, 25 Aug 2025 20:21:43 +0800 Subject: [PATCH] [TRTLLM-7030][fix] BREAKING CHANGE: Mismatch between docs and actual commands (#7191) Signed-off-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com> --- benchmarks/cpp/README.md | 4 +- docs/source/advanced/disaggregated-service.md | 11 ----- examples/cpp/executor/README.md | 4 +- examples/disaggregated/README.md | 47 ++++++++++++------- examples/disaggregated/disagg_config.yaml | 4 +- examples/disaggregated/slurm/gen_yaml.py | 4 +- tensorrt_llm/llmapi/llm_args.py | 2 +- .../accuracy/test_disaggregated_serving.py | 40 ++++++++-------- .../disagg_config_cache_aware_balance.yaml | 4 +- ...onfig_cache_aware_balance_deepseek_v3.yaml | 4 +- .../disagg_config_cache_reuse.yaml | 4 +- ...disagg_config_cache_reuse_deepseek_v3.yaml | 4 +- .../disagg_config_conditional.yaml | 4 +- ...disagg_config_conditional_deepseek_v3.yaml | 4 +- .../disagg_config_ctxpp2_genpp2.yaml | 4 +- .../disagg_config_ctxpp2_gentp2.yaml | 4 +- .../disagg_config_ctxpp4_genpp4.yaml | 4 +- ...config_ctxtp1_gentp1_deepseek_v3_lite.yaml | 4 +- ...txtp1_gentp1_deepseek_v3_lite_one_mtp.yaml | 4 +- ..._v3_lite_one_mtp_attention_dp_overlap.yaml | 4 +- ...txtp1_gentp1_deepseek_v3_lite_two_mtp.yaml | 4 +- .../disagg_config_ctxtp2_genpp2.yaml | 4 +- .../disagg_config_ctxtp2_gentp1.yaml | 4 +- ...sagg_config_ctxtp2_gentp1_trt_backend.yaml | 4 +- ...config_ctxtp2_gentp2_deepseek_v3_lite.yaml | 4 +- ..._gentp2_deepseek_v3_lite_attention_dp.yaml | 4 +- ...tp2_deepseek_v3_lite_attention_dp_one.yaml | 4 +- ...deepseek_v3_lite_attention_dp_one_mtp.yaml | 4 +- ...deepseek_v3_lite_attention_dp_overlap.yaml | 4 +- ..._lite_attention_dp_overlap_cuda_graph.yaml | 4 +- ...ig_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml | 4 +- ...g_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml | 4 +- ...2_deepseek_v3_lite_overlap_cuda_graph.yaml | 4 +- ...ig_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml | 4 +- .../disagg_config_ctxtp2pp2_gentp2pp2.yaml | 4 +- .../disagg_config_cuda_graph_padding.yaml | 4 +- .../disagg_config_diff_max_tokens.yaml | 4 +- .../test_configs/disagg_config_gen_only.yaml | 2 +- .../disagg_config_gen_only_bs1.yaml | 4 +- .../disagg_config_gen_only_trt_backend.yaml | 2 +- .../disagg_config_load_balance.yaml | 4 +- .../test_configs/disagg_config_mixed.yaml | 4 +- .../test_configs/disagg_config_ngram.yaml | 4 +- .../test_configs/disagg_config_overlap.yaml | 4 +- .../disagg_config_trt_backend.yaml | 4 +- .../disagg_config_trtllm_sampler.yaml | 4 +- .../defs/disaggregated/test_disaggregated.py | 4 +- .../disaggregated/test_disaggregated_etcd.py | 4 +- .../test_disaggregated_single_gpu.py | 6 +-- tests/unittest/llmapi/test_llm_args.py | 6 +-- 50 files changed, 144 insertions(+), 140 deletions(-) diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md index 0b89bae602..ae3287faf0 100644 --- a/benchmarks/cpp/README.md +++ b/benchmarks/cpp/README.md @@ -336,7 +336,7 @@ cd cpp/build `disaggServerBenchmark` only supports `decoder-only` models. Here is the basic usage: ``` -export TRTLLM_USE_MPI_KVCACHE=1 +export TRTLLM_USE_UCX_KVCACHE=1 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \ --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path} ``` @@ -344,7 +344,7 @@ This command will launch m context engines and n generation engines. You need to for example: ``` -export TRTLLM_USE_MPI_KVCACHE=1 +export TRTLLM_USE_UCX_KVCACHE=1 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path} # need 6 gpus and 7 processes to launch the benchmark. diff --git a/docs/source/advanced/disaggregated-service.md b/docs/source/advanced/disaggregated-service.md index d8e376d62c..a9955b940a 100644 --- a/docs/source/advanced/disaggregated-service.md +++ b/docs/source/advanced/disaggregated-service.md @@ -66,17 +66,6 @@ A. Yes, it's recommended that different executor use different GPUs . We support ### Debugging FAQs -*Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`* - -A. please set `backendType` of `CacheTransceiverConfig`. -```cpp -ExecutorConfig executorConfig{...}; - -executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT)); -``` - -When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will transfer the KV cache using `CUDA-aware MPI`. All executor processes involved must share the same MPI world communicator. Consequently, with `TRTLLM_USE_MPI_KVCACHE=1`, TRT-LLM only supports launching multiple executors via `MPI`. Additionally, the `CommunicationMode` for the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for the `disaggregated-service`. These restrictions do not apply when `TRTLLM_USE_UCX_KVCACHE=1` is set. - *Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?* A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer. diff --git a/examples/cpp/executor/README.md b/examples/cpp/executor/README.md index fdb5b0d434..4cc9b72ad9 100644 --- a/examples/cpp/executor/README.md +++ b/examples/cpp/executor/README.md @@ -124,10 +124,10 @@ From the `examples/cpp/executor/build` folder, you can also run the `executorExa ``` ./executorExampleDisaggregated -h ``` -Note setting `TRTLLM_USE_MPI_KVCACHE=1` is required to run disaggregated executor. +Note setting `TRTLLM_USE_UCX_KVCACHE=1` is required to run disaggregated executor. For example, you can run : ``` -export TRTLLM_USE_MPI_KVCACHE=1 +export TRTLLM_USE_UCX_KVCACHE=1 mpirun -n --allow-run-as-root --oversubscribe ./executorExampleDisaggregated --context_engine_dir --context_rank_size --generation_engine_dir --generation_rank_size --input_tokens ../inputTokens.csv diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 713e69e6be..2319e3f5b9 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -12,24 +12,39 @@ cache_transceiver_config: max_tokens_in_buffer: ``` -`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is UCX. +`backend` specifies the communication backend for transferring the KV cache, valid options include `DEFAULT`, `UCX`, `NIXL`, and `MPI`, the default backend is `UCX`. -`max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance. +`max_tokens_in_buffer` defines the buffer size for KV cache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance. -You can use multiple `trtllm-serve` commands to launch the context and generation servers that will be used -for disaggregated serving. For example, you could launch two context servers and one generation servers as follows: +You can use multiple `trtllm-serve` commands to launch the context and generation servers required for disaggregated serving. For instance, you might start two context servers and one generation server as shown below. + +Begin by creating `ctx_extra-llm-api-config.yml` and `gen_extra-llm-api-config.yml` following the specified format. + +```yaml +# ctx_extra-llm-api-config.yml + +# The overlap scheduler for context servers is currently disabled, as it is +# not yet supported in disaggregated context server architectures. +disable_overlap_scheduler: True +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 2048 +``` + +```yaml +# gen_extra-llm-api-config.yml + +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 2048 +``` + +Then, start the context and generation servers separately. ```bash -# Generate context_extra-llm-api-config.yml -# Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet -echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml - # Start context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 & - -# Generate gen_extra-llm-api-config.yml -echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --extra_llm_api_options ./ctx_extra-llm-api-config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --extra_llm_api_options ./ctx_extra-llm-api-config.yml &> log_ctx_1 & # Start generation servers CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 & @@ -95,8 +110,8 @@ After this, you can enable the dynamic scaling feature for the use case above as export TRTLLM_USE_UCX_KVCACHE=1 # Context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --server_role CONTEXT --extra_llm_api_options ./context_extra-llm-api-config.yml --metadata_server_config_file ./metadata_config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --server_role CONTEXT --extra_llm_api_options ./context_extra-llm-api-config.yml --metadata_server_config_file ./metadata_config.yml &> log_ctx_1 & +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --server_role CONTEXT --extra_llm_api_options ./ctx_extra-llm-api-config.yml --metadata_server_config_file ./metadata_config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --server_role CONTEXT --extra_llm_api_options ./ctx_extra-llm-api-config.yml --metadata_server_config_file ./metadata_config.yml &> log_ctx_1 & # Generation servers CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --server_role GENERATION --extra_llm_api_options ./gen_extra-llm-api-config.yml --metadata_server_config_file ./metadata_config.yml &> log_gen_0 & @@ -180,4 +195,4 @@ trtllm-serve disaggregated -c disagg_config.yaml ## Know Issues -The MPI communication backend for kvCache transfer has been deprecated and may not be supported in the future. When using the MPI backend, the environment variable `TRTLLM_USE_MPI_KVCACHE=1` should be set to avoid conflicts between mpi4py and kvCache transfer. +The MPI communication backend for KV cache transfer has been deprecated and may not be supported in the future. When using the MPI backend, the environment variable `TRTLLM_USE_MPI_KVCACHE=1` should be set to avoid conflicts between mpi4py and KV cache transfer. diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml index ae72c1b074..6b2b4f7123 100644 --- a/examples/disaggregated/disagg_config.yaml +++ b/examples/disaggregated/disagg_config.yaml @@ -11,7 +11,7 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8001" generation_servers: @@ -19,6 +19,6 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8002" diff --git a/examples/disaggregated/slurm/gen_yaml.py b/examples/disaggregated/slurm/gen_yaml.py index a3f8ad32ac..b3865fd700 100644 --- a/examples/disaggregated/slurm/gen_yaml.py +++ b/examples/disaggregated/slurm/gen_yaml.py @@ -197,7 +197,7 @@ def gen_config_file(config_path: str, }, 'cache_transceiver_config': { 'max_tokens_in_buffer': cache_transceiver_max_num_tokens, - 'backend': 'default', + 'backend': 'DEFAULT', }, }, 'generation_servers': { @@ -225,7 +225,7 @@ def gen_config_file(config_path: str, }, 'cache_transceiver_config': { 'max_tokens_in_buffer': cache_transceiver_max_num_tokens, - 'backend': 'default', + 'backend': 'DEFAULT', }, 'stream_interval': 20, } diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index b7d46ed6fa..2fbe67c7fc 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1039,7 +1039,7 @@ class CacheTransceiverConfig(StrictBaseModel, PybindMirror): Configuration for the cache transceiver. """ - backend: Optional[Literal["default", "ucx", "nixl", "mpi"]] = Field( + backend: Optional[Literal["DEFAULT", "UCX", "NIXL", "MPI"]] = Field( default=None, description= "The communication backend type to use for the cache transceiver.") diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 0343c8bb05..b83ce6aab4 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -260,7 +260,7 @@ def run_parallel_test(model_name: str, model_path: str, ctx_pp: int, "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } gen_server_config = { @@ -269,7 +269,7 @@ def run_parallel_test(model_name: str, model_path: str, ctx_pp: int, "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } @@ -309,8 +309,8 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): gen_server_config = { "disable_overlap_scheduler": disable_overlap_scheduler } - ctx_server_config["cache_transceiver_config"] = {"backend": "default"} - gen_server_config["cache_transceiver_config"] = {"backend": "default"} + ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} + gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} disaggregated_server_config = { "hostname": "localhost", "port": 8000, @@ -351,7 +351,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } gen_server_config = { @@ -359,7 +359,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): "speculative_config": speculative_decoding_config, "kv_cache_config": kv_cache_config, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } disaggregated_server_config = { @@ -404,7 +404,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): "max_num_tokens": 13393 * 2, "max_batch_size": 1, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" }, "cuda_graph_config": None, } @@ -418,7 +418,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): "max_num_tokens": 13393 * 2, "max_batch_size": 16, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" }, "cuda_graph_config": None, } @@ -472,8 +472,8 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness): def test_auto_dtype(self, overlap_scheduler): ctx_server_config = {"disable_overlap_scheduler": True} gen_server_config = {"disable_overlap_scheduler": overlap_scheduler} - ctx_server_config["cache_transceiver_config"] = {"backend": "default"} - gen_server_config["cache_transceiver_config"] = {"backend": "default"} + ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} + gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} # Keep this low to avoid warmup OOM in CI ctx_server_config["max_seq_len"] = 8192 gen_server_config["max_seq_len"] = 8192 @@ -513,13 +513,13 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): ctx_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } gen_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } disaggregated_server_config = { @@ -550,8 +550,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): def test_auto_dtype(self, overlap_scheduler, mtp_nextn): ctx_server_config = {"disable_overlap_scheduler": True} gen_server_config = {"disable_overlap_scheduler": not overlap_scheduler} - ctx_server_config["cache_transceiver_config"] = {"backend": "default"} - gen_server_config["cache_transceiver_config"] = {"backend": "default"} + ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} + gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"} if mtp_nextn > 0: ctx_server_config["speculative_config"] = { "decoding_type": "MTP", @@ -597,14 +597,14 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): "disable_overlap_scheduler": True, "cuda_graph_config": None, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } gen_server_config = { "disable_overlap_scheduler": overlap_scheduler, "cuda_graph_config": None, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } ctx_server_config["kv_cache_config"] = { @@ -648,13 +648,13 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): ctx_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } gen_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } ctx_server_config["cache_transceiver_config"] @@ -686,14 +686,14 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): "disable_overlap_scheduler": True, "cuda_graph_config": None, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } gen_server_config = { "disable_overlap_scheduler": overlap_scheduler, "cuda_graph_config": None, "cache_transceiver_config": { - "backend": "default" + "backend": "DEFAULT" } } disaggregated_server_config = { diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml index 6db8a0f1a9..d64bac8763 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml @@ -21,7 +21,7 @@ context_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" - "localhost:8002" @@ -35,7 +35,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT kv_cache_config: enable_block_reuse: True enable_partial_reuse: False diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml index cc275b98c7..fe15f70085 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml @@ -17,7 +17,7 @@ context_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.1 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8001" - "localhost:8002" @@ -33,7 +33,7 @@ generation_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.1 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8003" - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml index 86da31c42b..3ad817167e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml @@ -15,7 +15,7 @@ context_servers: enable_partial_reuse: True event_buffer_max_size: 1024 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -30,6 +30,6 @@ generation_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.05 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml index e76a253c1a..06a4c154b4 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml @@ -15,7 +15,7 @@ context_servers: enable_partial_reuse: True event_buffer_max_size: 1024 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -30,6 +30,6 @@ generation_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.05 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml index 2292fe22aa..28816380fe 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml @@ -18,7 +18,7 @@ context_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.15 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -33,6 +33,6 @@ generation_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.15 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml index 345a958fa5..b7f3420272 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml @@ -18,7 +18,7 @@ context_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.15 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -33,6 +33,6 @@ generation_servers: event_buffer_max_size: 1024 free_gpu_memory_fraction: 0.15 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml index e6a9ab14fe..293e3e604a 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_genpp2.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml index 6d4e326168..67f41bc7e5 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp2_gentp2.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml index 6621c05d49..3571692123 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxpp4_genpp4.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml index 1f63caed57..83f9b3a3e8 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml @@ -10,7 +10,7 @@ context_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,6 +18,6 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml index 97c03fbbcb..57eb4ea004 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml @@ -14,7 +14,7 @@ context_servers: pipeline_parallel_size: 1 enable_attention_dp: true cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -23,6 +23,6 @@ generation_servers: pipeline_parallel_size: 1 enable_attention_dp: false cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml index 25612d4a78..4343850c77 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml @@ -14,7 +14,7 @@ context_servers: enable_attention_dp: true disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -24,6 +24,6 @@ generation_servers: enable_attention_dp: true disable_overlap_scheduler: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml index facc460330..837e5df8e3 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml @@ -14,7 +14,7 @@ context_servers: pipeline_parallel_size: 1 enable_attention_dp: true cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -25,4 +25,4 @@ generation_servers: urls: - "localhost:8002" cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml index a6e9b0c85d..ce53fd4626 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_genpp2.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml index 729bdf2cf9..1335d63adf 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml @@ -10,7 +10,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,7 +18,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml index 388be9d4d6..fa5dffa518 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml @@ -8,7 +8,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -16,7 +16,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml index 1bc2084286..6b22665e9f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml @@ -10,7 +10,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,6 +18,6 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml index 28d4c3556e..80a1a3636a 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml @@ -11,7 +11,7 @@ context_servers: pipeline_parallel_size: 1 enable_attention_dp: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -20,6 +20,6 @@ generation_servers: pipeline_parallel_size: 1 enable_attention_dp: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml index 0d05bef459..9dfb092151 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml @@ -11,7 +11,7 @@ context_servers: pipeline_parallel_size: 1 enable_attention_dp: true cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -20,6 +20,6 @@ generation_servers: pipeline_parallel_size: 1 enable_attention_dp: false cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml index fa771b9e30..4b6bc571da 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml @@ -14,7 +14,7 @@ context_servers: pipeline_parallel_size: 1 enable_attention_dp: true cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -23,7 +23,7 @@ generation_servers: pipeline_parallel_size: 1 enable_attention_dp: false cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml index 9398f7ddd2..26218586f4 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml @@ -11,7 +11,7 @@ context_servers: enable_attention_dp: True disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -21,6 +21,6 @@ generation_servers: enable_attention_dp: True disable_overlap_scheduler: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml index f8c04735eb..99034f8a1a 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml @@ -10,7 +10,7 @@ context_servers: enable_attention_dp: true disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -22,6 +22,6 @@ generation_servers: enable_padding: False disable_overlap_scheduler: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml index 912178b7f6..4cfe18ebaf 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml @@ -9,7 +9,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "mpi" + backend: "MPI" urls: - "localhost:8001" generation_servers: @@ -17,6 +17,6 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "mpi" + backend: "MPI" urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml index e4fd09a1ce..3b1aa8fc0e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml @@ -9,7 +9,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "nixl" + backend: "NIXL" urls: - "localhost:8001" generation_servers: @@ -17,6 +17,6 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "nixl" + backend: "NIXL" urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml index 9ace31717e..4c601fbb86 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml @@ -9,7 +9,7 @@ context_servers: pipeline_parallel_size: 1 disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -20,6 +20,6 @@ generation_servers: enable_padding: False disable_overlap_scheduler: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml index b21637529b..d3395938ca 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml @@ -9,7 +9,7 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "ucx" + backend: "UCX" urls: - "localhost:8001" generation_servers: @@ -17,6 +17,6 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "ucx" + backend: "UCX" urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml index 2e862eb6be..db62a89cf7 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2pp2_gentp2pp2.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml index 8b992d210c..56db3df769 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml @@ -16,7 +16,7 @@ context_servers: batch_sizes: [1,3000] disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -34,6 +34,6 @@ generation_servers: batch_sizes: [1,4,8,16,24,32] disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml index 5391d2db3f..3d9cfda12e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_diff_max_tokens.yaml @@ -10,7 +10,7 @@ context_servers: max_num_tokens: 512 max_batch_size: 64 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,6 +18,6 @@ generation_servers: max_num_tokens: 256 max_batch_size: 32 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml index f42ea826c0..92b1383764 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml @@ -14,7 +14,7 @@ generation_servers: enable_block_reuse: False enable_partial_reuse: False cache_transceiver_config: - backend: default + backend: DEFAULT print_iter_log: True urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml index 4efbc9a949..19d1eca714 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml @@ -17,7 +17,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -32,6 +32,6 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml index 6d9fc7d07f..ad706f8bf1 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml @@ -13,7 +13,7 @@ generation_servers: enable_block_reuse: False enable_partial_reuse: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml index f0766a9c6d..f0593d9ef6 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml @@ -19,7 +19,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" - "localhost:8002" @@ -38,7 +38,7 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: False cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8003" - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml index 31e429c440..27d7ec4ee8 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml @@ -10,7 +10,7 @@ context_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,7 +18,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml index 2f779f598a..4e3417c732 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml @@ -9,7 +9,7 @@ context_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8001" generation_servers: @@ -17,7 +17,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: "default" + backend: "DEFAULT" urls: - "localhost:8002" speculative_config: diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml index 5cdafaed34..55990bbaa6 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml @@ -16,7 +16,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -31,6 +31,6 @@ generation_servers: enable_partial_reuse: False disable_overlap_scheduler: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml index 885991c886..3eb275c87e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml @@ -10,7 +10,7 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -18,6 +18,6 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml index b7ecb48b30..fae83e4389 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml @@ -16,7 +16,7 @@ context_servers: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False cache_transceiver_config: - backend: "default" + backend: "DEFAULT" disable_overlap_scheduler: True urls: - "localhost:8001" @@ -32,7 +32,7 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False cache_transceiver_config: - backend: "default" + backend: "DEFAULT" disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 2ece78729c..fed3c05d28 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -1276,8 +1276,8 @@ def test_disaggregated_benchmark_on_diff_backends( if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version( ) != 90: pytest.skip("The test should only run on Hopper") - nixl_config = get_config_for_benchmark(benchmark_model_root, "nixl") - ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx") + nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") + ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") temp_dir = tempfile.TemporaryDirectory() nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") diff --git a/tests/integration/defs/disaggregated/test_disaggregated_etcd.py b/tests/integration/defs/disaggregated/test_disaggregated_etcd.py index 656b9a675d..a495f35faf 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_etcd.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_etcd.py @@ -244,7 +244,7 @@ def create_config_files(config): context_config_content = """pytorch_backend_config: disable_overlap_scheduler: True cache_transceiver_config: - backend: "default" + backend: "DEFAULT" max_tokens_in_buffer: 2048""" with open(CONTEXT_CONFIG_FILE, 'w') as file: @@ -252,7 +252,7 @@ cache_transceiver_config: # Create generation config file generation_config_content = """cache_transceiver_config: - backend: "default" + backend: "DEFAULT" max_tokens_in_buffer: 2048""" with open(GENERATION_CONFIG_FILE, 'w') as file: diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 55971c3ad0..0ed814963d 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -131,7 +131,7 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt, kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)] cache_transceiver_configs = [ - CacheTransceiverConfig(backend="default") for _ in range(2) + CacheTransceiverConfig(backend="DEFAULT") for _ in range(2) ] model_names = [model_path(model) for _ in range(2)] ranks = [0, 1] @@ -274,7 +274,7 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph, for _ in range(2) ] cache_transceiver_configs = [ - CacheTransceiverConfig(backend="default") for _ in range(2) + CacheTransceiverConfig(backend="DEFAULT") for _ in range(2) ] model_names = [model_path(model) for _ in range(2)] ranks = [0, 1] @@ -377,7 +377,7 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path, for _ in range(2) ] cache_transceiver_configs = [ - CacheTransceiverConfig(backend="default") for _ in range(2) + CacheTransceiverConfig(backend="DEFAULT") for _ in range(2) ] model_names = [model_path(model) for _ in range(2)] ranks = [0, 1] diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 66f0494660..c0de80b265 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -661,15 +661,15 @@ class TestStrictBaseModelArbitraryArgs: def test_cache_transceiver_config_arbitrary_args(self): """Test that CacheTransceiverConfig rejects arbitrary arguments.""" # Valid arguments should work - config = CacheTransceiverConfig(backend="ucx", + config = CacheTransceiverConfig(backend="UCX", max_tokens_in_buffer=1024) - assert config.backend == "ucx" + assert config.backend == "UCX" assert config.max_tokens_in_buffer == 1024 # Arbitrary arguments should be rejected with pytest.raises( pydantic_core._pydantic_core.ValidationError) as exc_info: - CacheTransceiverConfig(backend="ucx", invalid_config="should_fail") + CacheTransceiverConfig(backend="UCX", invalid_config="should_fail") assert "invalid_config" in str(exc_info.value) def test_torch_compile_config_arbitrary_args(self):