mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[cherry-pick][https://nvbugs/5670793][fix] Solve trtllm-serve launch_disaggregated issue (#9346)
Signed-off-by: xxi <xxi@nvidia.com>
This commit is contained in:
parent
a21be43677
commit
f1ed057b4c
@ -29,7 +29,7 @@ from tensorrt_llm.llmapi.disagg_utils import (DisaggClusterConfig,
|
||||
parse_disagg_config_file,
|
||||
parse_metadata_server_config_file)
|
||||
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict
|
||||
from tensorrt_llm.llmapi.mpi_session import find_free_port
|
||||
from tensorrt_llm.llmapi.mpi_session import find_free_ipc_addr
|
||||
from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory
|
||||
from tensorrt_llm.logger import logger, severity_map
|
||||
from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer
|
||||
@ -738,10 +738,10 @@ def _launch_disaggregated_leader(sub_comm, instance_idx: int, config_file: str,
|
||||
|
||||
# This mimics the behavior of trtllm-llmapi-launch
|
||||
# TODO: Make the port allocation atomic
|
||||
free_port = find_free_port()
|
||||
free_ipc_addr = find_free_ipc_addr()
|
||||
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS] = "1"
|
||||
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.
|
||||
value] = f"tcp://127.0.0.1:{free_port}"
|
||||
os.environ[
|
||||
LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.value] = free_ipc_addr
|
||||
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_RUN_REMOTE_MPI_SESSION_CLIENT.
|
||||
value] = "1"
|
||||
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_INSTANCE_IDX] = str(instance_idx)
|
||||
|
||||
@ -541,6 +541,13 @@ def find_free_port() -> int:
|
||||
return s.getsockname()[1]
|
||||
|
||||
|
||||
def find_free_ipc_addr() -> str:
    """Build a unique ``ipc://`` endpoint address under the temp directory.

    The result has the form ``ipc://<tempdir>/rpc_<uuid4>``, where
    ``<tempdir>`` is whatever ``tempfile.gettempdir()`` reports and
    ``<uuid4>`` is a freshly generated UUID. Only the address string is
    produced here; nothing is created on the filesystem.

    Returns:
        The ``ipc://``-prefixed address string.
    """
    # Imports are kept function-local, as in the original definition.
    import os
    import tempfile
    import uuid

    # A new UUID per call keeps concurrently generated addresses distinct.
    socket_name = f"rpc_{uuid.uuid4()}"
    socket_path = os.path.join(tempfile.gettempdir(), socket_name)
    return f"ipc://{socket_path}"
|
||||
|
||||
|
||||
def get_mpi_world_size() -> int:
|
||||
# avoid cyclic import
|
||||
from ..executor.utils import get_spawn_proxy_process_env
|
||||
|
||||
Loading…
Reference in New Issue
Block a user