[Misc] Remove dead VLLM_RPC_TIMEOUT env var and fix profiling doc that references it (#44128)

Signed-off-by: Daoyuan Li <94409450+DaoyuanLi2816@users.noreply.github.com>
This commit is contained in:
Daoyuan Li
2026-06-02 17:22:10 -07:00
committed by GitHub
parent a4ac746405
commit bd98e97557
11 changed files with 1 additions and 15 deletions
@@ -2,7 +2,6 @@
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
@@ -2,7 +2,6 @@
{
"test_name": "latency_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
@@ -13,7 +13,6 @@
200
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
@@ -9,7 +9,6 @@
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
@@ -5,7 +5,6 @@
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
@@ -2,7 +2,6 @@
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
@@ -2,7 +2,6 @@
{
"test_name": "throughput_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
+1 -2
View File
@@ -35,8 +35,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
!!! tip
When you stop the profiler, it flushes all of the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, the flush takes about 10 minutes on an H100.
Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
`export VLLM_RPC_TIMEOUT=1800000`
The engine client waits for this flush to complete without timing out, so simply allow the stop call to run to completion.
### Example commands and usage
-4
View File
@@ -95,7 +95,6 @@ if TYPE_CHECKING:
CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_RPC_TIMEOUT: int = 10000 # ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_MAX_N_SEQUENCES: int = 16384
VLLM_PLUGINS: list[str] | None = None
@@ -1015,9 +1014,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv(
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
),
# Time in ms for the zmq client to wait for a response from the backend
# server for simple data operations
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
# Timeout in seconds for keeping HTTP connections alive in API server
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")