[cherry-pick][https://nvbugs/5670793][fix] Solve trtllm-serve launch_disaggregated issue (#9346)

Signed-off-by: xxi <xxi@nvidia.com>
This commit is contained in:
xxi 2025-11-27 16:13:58 +08:00 committed by GitHub
parent a21be43677
commit f1ed057b4c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 4 deletions

View File

@ -29,7 +29,7 @@ from tensorrt_llm.llmapi.disagg_utils import (DisaggClusterConfig,
parse_disagg_config_file,
parse_metadata_server_config_file)
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict
from tensorrt_llm.llmapi.mpi_session import find_free_port
from tensorrt_llm.llmapi.mpi_session import find_free_ipc_addr
from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory
from tensorrt_llm.logger import logger, severity_map
from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer
@ -738,10 +738,10 @@ def _launch_disaggregated_leader(sub_comm, instance_idx: int, config_file: str,
# This mimics the behavior of trtllm-llmapi-launch
# TODO: Make the port allocation atomic
free_port = find_free_port()
free_ipc_addr = find_free_ipc_addr()
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS] = "1"
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.
value] = f"tcp://127.0.0.1:{free_port}"
os.environ[
LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.value] = free_ipc_addr
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_RUN_REMOTE_MPI_SESSION_CLIENT.
value] = "1"
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_INSTANCE_IDX] = str(instance_idx)

View File

@ -541,6 +541,13 @@ def find_free_port() -> int:
return s.getsockname()[1]
def find_free_ipc_addr() -> str:
    """Build a fresh ``ipc://`` endpoint address under the system temp directory.

    The socket file name is ``rpc_`` followed by a random UUID4, so repeated
    calls yield different addresses. Only the address string is produced
    here; nothing is created on disk by this function.

    Returns:
        A string of the form ``ipc://<tempdir>/rpc_<uuid4>``.
    """
    # Imports are kept function-local, as in the original definition.
    import os
    import tempfile
    import uuid

    socket_name = f"rpc_{uuid.uuid4()}"
    socket_path = os.path.join(tempfile.gettempdir(), socket_name)
    return "ipc://" + socket_path
def get_mpi_world_size() -> int:
# avoid cyclic import
from ..executor.utils import get_spawn_proxy_process_env