[https://nvbugs/5814203][fix] Fix the port 8000 already-in-use issue in the stress test. (#10756)

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
dominicshanshan 2026-01-21 13:39:32 +08:00 committed by Yanchao Lu
parent ae58a7ed20
commit c98c286c0f
3 changed files with 51 additions and 8 deletions

@@ -32,6 +32,7 @@ import contextlib
import json
import os
import re
import socket
import subprocess
import tempfile
import threading
@@ -44,7 +45,7 @@ import pandas as pd
import pytest
import requests
import yaml
from defs.common import parse_gsm8k_output
from defs.common import get_free_port_in_ci, parse_gsm8k_output
from defs.conftest import get_device_count, get_device_memory, llm_models_root
from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
print_warning)
@@ -72,10 +73,18 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
GRACEFUL_TERMINATION_TIMEOUT = 300 # seconds - set longer when stressing large models
def _get_default_port() -> int:
"""Get a default port using CI allocation if available, otherwise use 8000."""
try:
return get_free_port_in_ci()
except Exception:
return 8000
@dataclass(frozen=True)
class ServerConfig:
"""Dataclass to store server configuration for trtllm-serve"""
port: int = 8000
port: int = field(default_factory=_get_default_port)
host: str = "localhost"
pp_size: int = 1
ep_size: Optional[int] = 1
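
The get_free_port_in_ci helper imported above lives in defs.common and its body is not part of this diff; below is a minimal sketch of the common pattern behind such helpers, assuming it asks the OS for an ephemeral port by binding to port 0 (the real helper may instead draw from a CI-managed port range):

```python
import socket

def get_free_port_in_ci() -> int:
    # Hypothetical stand-in for defs.common.get_free_port_in_ci:
    # bind to port 0 so the OS hands back a currently free port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]
```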
@@ -167,8 +176,7 @@ class PerformanceParams:
# Ensure indefinite runs, especially for different concurrency values
test_timeout: int = 3600 # 1 hour for tinyllama and llama-v3-8b-instruct-hf
concurrency_list: List[int] = field(
default_factory=lambda:
[8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024])
default_factory=lambda: [8, 16, 32, 64, 128, 256])
@property
def request_count_list(self) -> List[int]:
@@ -341,6 +349,26 @@ def check_server_health(server_url: str,
return False, f"Unexpected error during health check: {str(e)}"
def is_port_available(port: int,
host: str = "localhost") -> Tuple[bool, Optional[str]]:
"""
Check if a port is available for binding.
Args:
port: Port number to check
host: Host to bind to
Returns:
Tuple of (is_available, error_message)
"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind((host, port))
return True, None
except OSError as e:
return False, f"Port {port} is already in use on {host}: {e}"
@pytest.mark.parametrize(
"test_mode",
["stress-test", "stress-stage-alone", "stress-test-with-accuracy"],
@@ -519,6 +547,10 @@ def stress_test(config,
else:
stress_config = None
# Check if port is available
is_available, port_error = is_port_available(test_server_config.port,
test_server_config.host)
# Check if server is already running
is_healthy, _ = check_server_health(test_server_config.url,
test_server_config.health_check_timeout)
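
The diff stops before showing how these two results are consumed; one plausible combination, shown purely as a hypothetical sketch with the diff's variable names:

```python
# Hypothetical glue (not in this diff): start a new server only when the
# port is free, or reuse a server that already answers health checks.
if not is_available and not is_healthy:
    raise RuntimeError(port_error)
```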
@@ -530,6 +562,9 @@
# Start server
print_info("Starting trtllm-serve server...")
print_info(f"Model path: {model_path}")
print_info(
f"Server port: {test_server_config.port} (allocated via CI port mechanism)"
)
# Verify that model path exists
if not os.path.exists(model_path):
@@ -552,7 +587,7 @@
extra_llm_options.update({
"cuda_graph_config": {
"enable_padding": True,
"batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
"batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
},
"print_iter_log": True,
})
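
The trimmed batch_sizes list tracks the shorter concurrency_list above, so CUDA-graph capture covers only batch sizes the test actually drives. For context, a minimal sketch of serializing such an options dict to a YAML file for trtllm-serve; wiring it up via --extra_llm_api_options is an assumption about the test's plumbing, not something this diff shows:

```python
import tempfile

import yaml

extra_llm_options = {
    "cuda_graph_config": {
        "enable_padding": True,
        "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
    },
    "print_iter_log": True,
}

# Dump to a temp YAML file that a trtllm-serve launch command could
# reference (file handling here is illustrative only).
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(extra_llm_options, f)
    print(f.name)
```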
@@ -759,6 +794,7 @@ def create_aiperf_command(model_name,
model_path,
request_count,
concurrency,
server_url,
input_len_mean=PerformanceParams.input_len_mean,
input_len_std=PerformanceParams.input_len_std,
output_len_mean=PerformanceParams.output_len_mean,
@@ -772,6 +808,7 @@
model_path: Path to the model
request_count: Number of requests to send
concurrency: Number of concurrent requests
server_url: URL of the server (e.g., "localhost:8000")
input_len_mean: Mean input length
input_len_std: Standard deviation of input length
output_len_mean: Mean output length
@@ -790,6 +827,8 @@
model_path,
"--endpoint-type",
"completions",
"-u",
server_url,
"--random-seed",
"123",
"--synthetic-input-tokens-mean",
@@ -928,6 +967,7 @@ def measure_capacity_stage(model_name,
model_path=model_path,
request_count=request_count,
concurrency=concurrency,
server_url=f"{server_config.host}:{server_config.port}",
input_len_mean=performance_params.input_len_mean,
input_len_std=performance_params.input_len_std,
output_len_mean=performance_params.output_len_mean,
@@ -1023,6 +1063,7 @@ def stress_stage(model_name,
model_path=model_path,
request_count=request_count,
concurrency=stress_concurrency,
server_url=f"{server_config.host}:{server_config.port}",
input_len_mean=PerformanceParams.input_len_mean,
input_len_std=PerformanceParams.input_len_std,
output_len_mean=PerformanceParams.output_len_mean,
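
Threading server_url through both stages means every aiperf invocation now targets the dynamically allocated port instead of a hard-coded 8000. A condensed, self-contained sketch of how the new "-u" pair is derived, mirroring the call sites above (the port value is a placeholder):

```python
from dataclasses import dataclass
from typing import List

@dataclass(frozen=True)
class ServerConfig:
    port: int = 8000
    host: str = "localhost"

def aiperf_url_args(cfg: ServerConfig) -> List[str]:
    # Build the '-u <host:port>' pair that create_aiperf_command
    # splices into the benchmark command after this change.
    return ["-u", f"{cfg.host}:{cfg.port}"]

print(aiperf_url_args(ServerConfig(port=12345)))  # ['-u', 'localhost:12345']
```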

@@ -242,8 +242,8 @@ l0_a10:
- accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
- accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
- accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-trt-stress-test]
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
- test_e2e.py::test_gpt3_175b_1layers_build_only # 6 mins
- examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
- unittest/trt/model/test_mamba.py # 3 mins
@@ -263,8 +263,8 @@ l0_a10:
stage: post_merge
backend: pytorch
tests:
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
- stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
- condition:
ranges:
system_gpu_count:

@@ -340,7 +340,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5814203)
unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5814215)
full:sm89/accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5814504)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5819005)
unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014)
@@ -384,6 +384,8 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwel
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731)
test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)