mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Misc] Remove dead VLLM_RPC_TIMEOUT env var and fix profiling doc that references it (#44128)
Signed-off-by: Daoyuan Li <94409450+DaoyuanLi2816@users.noreply.github.com>
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
{
|
||||
"test_name": "latency_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
{
|
||||
"test_name": "latency_llama8B_tp2",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
200
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
|
||||
},
|
||||
"server_parameters": {
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
128
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp1",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
{
|
||||
"test_name": "throughput_llama8B_tp2",
|
||||
"environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
|
||||
@@ -35,8 +35,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
|
||||
|
||||
!!! tip
|
||||
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
|
||||
Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
|
||||
`export VLLM_RPC_TIMEOUT=1800000`
|
||||
The engine client waits for this flush to complete without timing out, so simply allow the stop call to run to completion.
|
||||
|
||||
### Example commands and usage
|
||||
|
||||
|
||||
@@ -95,7 +95,6 @@ if TYPE_CHECKING:
|
||||
CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
|
||||
VERBOSE: bool = False
|
||||
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
|
||||
VLLM_RPC_TIMEOUT: int = 10000 # ms
|
||||
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
|
||||
VLLM_MAX_N_SEQUENCES: int = 16384
|
||||
VLLM_PLUGINS: list[str] | None = None
|
||||
@@ -1015,9 +1014,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv(
|
||||
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
|
||||
),
|
||||
# Time in ms for the zmq client to wait for a response from the backend
|
||||
# server for simple data operations
|
||||
"VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
|
||||
# Timeout in seconds for keeping HTTP connections alive in API server
|
||||
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
|
||||
os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
|
||||
|
||||
Reference in New Issue
Block a user