[Misc] Remove dead VLLM_RPC_TIMEOUT env var and fix profiling doc that references it (#44128)

Signed-off-by: Daoyuan Li <94409450+DaoyuanLi2816@users.noreply.github.com>
2026-06-06 00:16:14 +00:00 · 2026-06-02 17:22:10 -07:00
parent a4ac746405
commit bd98e97557
11 changed files with 1 additions and 15 deletions
@@ -2,7 +2,6 @@
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
@@ -2,7 +2,6 @@
    {
        "test_name": "latency_llama8B_tp2",
        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
@@ -13,7 +13,6 @@
      200
    ],
    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
    ],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
    },
    "server_parameters": {
@@ -9,7 +9,6 @@
      128
    ],
    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
    ],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
    ],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
@@ -2,7 +2,6 @@
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
@@ -2,7 +2,6 @@
    {
        "test_name": "throughput_llama8B_tp2",
        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
@@ -35,8 +35,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.

 !!! tip
    To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-    Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
-    `export VLLM_RPC_TIMEOUT=1800000`
+    The engine client waits for this flush to complete without timing out, so simply allow the stop call to run to completion.

 ### Example commands and usage

@@ -95,7 +95,6 @@ if TYPE_CHECKING:
    CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
-    VLLM_RPC_TIMEOUT: int = 10000  # ms
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_MAX_N_SEQUENCES: int = 16384
    VLLM_PLUGINS: list[str] | None = None
@@ -1015,9 +1014,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv(
        "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
    ),
-    # Time in ms for the zmq client to wait for a response from the backend
-    # server for simple data operations
-    "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
    # Timeout in seconds for keeping HTTP connections alive in API server
    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
        os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")