[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8582)

Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
Yan Chunwei 2025-10-23 09:14:35 +08:00 committed by GitHub
parent c7b06b1b0a
commit 0d929f8dc7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -24,17 +24,9 @@ function mpi_world_size {
}
function export_free_tcp_addr_for_spawn_proxy_process {
# find free port starting from 10012
local free_port=$(python -c 'import socket; s=socket.socket();
port = 10012
while True:
try:
s.bind(("", port))
break
except OSError:
port += 1
print(port); s.close()')
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}"
# Generate unique IPC address without importing tensorrt_llm to avoid MPI initialization conflicts
local free_port=$(python3 -c "import uuid, tempfile, os; print(f'ipc://{os.path.join(tempfile.gettempdir(), \"rpc_test_\" + str(uuid.uuid4()))}')")
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR=$free_port
log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"
export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32)
@@ -44,9 +36,12 @@ print(port); s.close()')
export tllm_mpi_size=$(mpi_world_size)
log_stderr "tllm_mpi_size: $tllm_mpi_size"
export_free_tcp_addr_for_spawn_proxy_process
if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
# IPC only works on localhost and in MPI rank0 process
export_free_tcp_addr_for_spawn_proxy_process
log_stderr "Rank${mpi_rank} run ${task_with_command[@]} in background"
# MPI doesn't allow spawn a process sharing the MPI environment in a MPI