mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[https://nvbugs/5569754][fix] trtllm-llmapi-launch port conflict (#8582)
Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
parent
c7b06b1b0a
commit
0d929f8dc7
@ -24,17 +24,9 @@ function mpi_world_size {
|
||||
}
|
||||
|
||||
function export_free_tcp_addr_for_spawn_proxy_process {
|
||||
# find free port starting from 10012
|
||||
local free_port=$(python -c 'import socket; s=socket.socket();
|
||||
port = 10012
|
||||
while True:
|
||||
try:
|
||||
s.bind(("", port))
|
||||
break
|
||||
except OSError:
|
||||
port += 1
|
||||
print(port); s.close()')
|
||||
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}"
|
||||
# Generate a unique IPC address without importing tensorrt_llm, to avoid MPI initialization conflicts
|
||||
local free_port=$(python3 -c "import uuid, tempfile, os; print(f'ipc://{os.path.join(tempfile.gettempdir(), \"rpc_test_\" + str(uuid.uuid4()))}')")
|
||||
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR=$free_port
|
||||
log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"
|
||||
|
||||
export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32)
|
||||
@ -44,9 +36,12 @@ print(port); s.close()')
|
||||
export tllm_mpi_size=$(mpi_world_size)
|
||||
log_stderr "tllm_mpi_size: $tllm_mpi_size"
|
||||
|
||||
export_free_tcp_addr_for_spawn_proxy_process
|
||||
|
||||
if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
|
||||
|
||||
# IPC only works on localhost and in the MPI rank 0 process
|
||||
export_free_tcp_addr_for_spawn_proxy_process
|
||||
|
||||
log_stderr "Rank${mpi_rank} run ${task_with_command[@]} in background"
|
||||
|
||||
# MPI doesn't allow spawning a process sharing the MPI environment in an MPI
|
||||
|
||||
Loading…
Reference in New Issue
Block a user