diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json index fba695041e3..98811049d39 100644 --- a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json +++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json @@ -2,7 +2,6 @@ { "test_name": "latency_llama8B_tp1", "environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_KVCACHE_SPACE": 40 diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json index 77d1694ec86..5f048df5f6a 100644 --- a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json @@ -2,7 +2,6 @@ { "test_name": "latency_llama8B_tp2", "environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json index 9f226ef2f81..75b80b2c212 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -13,7 +13,6 @@ 200 ], "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json index 30879b5e9dc..16e8b0600ac 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json @@ -5,7 +5,6 @@ ], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120 }, "server_parameters": { diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json index 6d3455c478c..c62f244fc76 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json @@ -9,7 +9,6 @@ 128 ], "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json index 34c2cc82d39..9aa76c11089 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -5,7 +5,6 @@ ], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index c2d7768e202..0b7e7499965 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -5,7 +5,6 @@ ], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json index da84dd4d0c6..3863cccf43a 100644 --- a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -2,7 +2,6 @@ { "test_name": "throughput_llama8B_tp1", "environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_KVCACHE_SPACE": 40 diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json index dc214ddfb27..d3f16eff116 100644 --- a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json @@ -2,7 +2,6 @@ { "test_name": "throughput_llama8B_tp2", "environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index ce46445a983..c9bd0e5bdd9 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -35,8 +35,7 @@ Traces can be visualized using . !!! tip To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. - Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. - `export VLLM_RPC_TIMEOUT=1800000` + The engine client waits for this flush to complete without timing out, so simply allow the stop call to run to completion. ### Example commands and usage diff --git a/vllm/envs.py b/vllm/envs.py index c12e3cae247..dc11fbd224d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -95,7 +95,6 @@ if TYPE_CHECKING: CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False - VLLM_RPC_TIMEOUT: int = 10000 # ms VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_MAX_N_SEQUENCES: int = 16384 VLLM_PLUGINS: list[str] | None = None @@ -1015,9 +1014,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv( "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy" ), - # Time in ms for the zmq client to wait for a response from the backend - # server for simple data operations - "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")), # Timeout in seconds for keeping HTTP connections alive in API server "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int( os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")