Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[https://nvbugs/5814203][fix] Fix the "port 8000 already in use" issue in the stress test. (#10756)
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Parent: ae58a7ed20
Commit: c98c286c0f
@@ -32,6 +32,7 @@ import contextlib
 import json
 import os
 import re
+import socket
 import subprocess
 import tempfile
 import threading
@@ -44,7 +45,7 @@ import pandas as pd
 import pytest
 import requests
 import yaml
-from defs.common import parse_gsm8k_output
+from defs.common import get_free_port_in_ci, parse_gsm8k_output
 from defs.conftest import get_device_count, get_device_memory, llm_models_root
 from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
                                        print_warning)
@@ -72,10 +73,18 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
 GRACEFUL_TERMINATION_TIMEOUT = 300 # seconds - set longer when stress large model
 
 
+def _get_default_port() -> int:
+    """Get a default port using CI allocation if available, otherwise use 8000."""
+    try:
+        return get_free_port_in_ci()
+    except Exception:
+        return 8000
+
+
 @dataclass(frozen=True)
 class ServerConfig:
     """Dataclass to store server configuration for trtllm-serve"""
-    port: int = 8000
+    port: int = field(default_factory=_get_default_port)
     host: str = "localhost"
     pp_size: int = 1
     ep_size: Optional[int] = 1
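
The `get_free_port_in_ci` helper lives in `defs.common` and its implementation is not part of this diff. Assuming it ultimately asks the OS for a free ephemeral port (an assumption, not confirmed by this commit), a minimal sketch of that technique looks like this:

import socket


def get_free_port_sketch(host: str = "localhost") -> int:
    """Hypothetical stand-in for defs.common.get_free_port_in_ci."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((host, 0))          # port 0 lets the kernel pick an unused port
        return s.getsockname()[1]  # report the port the kernel chose


if __name__ == "__main__":
    print(get_free_port_sketch())  # e.g. 49214
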
@@ -167,8 +176,7 @@ class PerformanceParams:
     # Ensure indefinite runs specially for different concurrency values
     test_timeout: int = 3600 # 1 hours for tinyllama and llama-v3-8b-instruct-hf
     concurrency_list: List[int] = field(
-        default_factory=lambda:
-        [8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024])
+        default_factory=lambda: [8, 16, 32, 64, 128, 256])
 
     @property
     def request_count_list(self) -> List[int]:
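
Both changed defaults rely on `dataclasses.field(default_factory=...)`, which runs the factory each time an instance is created rather than freezing one value at class-definition time. A self-contained sketch of the pattern (the class and factory names here are illustrative, not the real ones from the test):

from dataclasses import dataclass, field
from typing import List


def _pick_port() -> int:
    return 8000  # stand-in for the CI allocation / fallback logic above


@dataclass(frozen=True)
class DemoConfig:
    # Computed when DemoConfig() is instantiated, not when the class is defined.
    port: int = field(default_factory=_pick_port)
    concurrency_list: List[int] = field(
        default_factory=lambda: [8, 16, 32, 64, 128, 256])


print(DemoConfig().port)               # 8000
print(DemoConfig().concurrency_list)   # [8, 16, 32, 64, 128, 256]
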
@@ -341,6 +349,26 @@ def check_server_health(server_url: str,
         return False, f"Unexpected error during health check: {str(e)}"
 
 
+def is_port_available(port: int,
+                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
+    """
+    Check if a port is available for binding.
+
+    Args:
+        port: Port number to check
+        host: Host to bind to
+
+    Returns:
+        Tuple of (is_available, error_message)
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        try:
+            s.bind((host, port))
+            return True, None
+        except OSError as e:
+            return False, f"Port {port} is already in use on {host}: {e}"
+
+
 @pytest.mark.parametrize(
     "test_mode",
     ["stress-test", "stress-stage-alone", "stress-test-with-accuracy"],
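
To see what the new helper reports, here is a standalone demo; `is_port_available` is repeated (minus the docstring) from the hunk above so the snippet runs on its own:

import socket
from typing import Optional, Tuple


def is_port_available(port: int,
                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
    # Same logic as the helper added in this diff.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True, None
        except OSError as e:
            return False, f"Port {port} is already in use on {host}: {e}"


if __name__ == "__main__":
    # Occupy an arbitrary free port so the check has something to collide with.
    blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    blocker.bind(("localhost", 0))
    busy_port = blocker.getsockname()[1]
    print(is_port_available(busy_port))  # (False, 'Port ... is already in use ...')
    blocker.close()
    print(is_port_available(busy_port))  # usually (True, None) once the socket is closed
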
@@ -519,6 +547,10 @@ def stress_test(config,
     else:
         stress_config = None
 
+    # Check if port is available
+    is_available, port_error = is_port_available(test_server_config.port,
+                                                 test_server_config.host)
+
     # Check if server is already running
     is_healthy, _ = check_server_health(test_server_config.url,
                                         test_server_config.health_check_timeout)
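
This hunk only stores the result; whatever consumes `is_available` and `port_error` lies outside the excerpt. One plausible (purely hypothetical) follow-up fragment in a pytest-based harness would be:

import pytest

# Hypothetical, not shown in this diff: bail out with a readable message
# instead of letting trtllm-serve fail later on a bind error.
if not is_available:
    pytest.fail(f"Cannot start trtllm-serve: {port_error}")
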
@@ -530,6 +562,9 @@ def stress_test(config,
     # Start server
     print_info("Starting trtllm-serve server...")
     print_info(f"Model path: {model_path}")
+    print_info(
+        f"Server port: {test_server_config.port} (allocated via CI port mechanism)"
+    )
 
     # Verify that model path exists
     if not os.path.exists(model_path):
@@ -552,7 +587,7 @@ def stress_test(config,
         extra_llm_options.update({
             "cuda_graph_config": {
                 "enable_padding": True,
-                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
             },
             "print_iter_log": True,
         })
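
The trimmed `batch_sizes` list is part of an options dict that, judging by the `yaml` and `tempfile` imports at the top of the file, is presumably written out as a YAML options file for trtllm-serve. A minimal sketch of that serialization step, under that assumption (the surrounding launch logic is not part of this excerpt):

import tempfile

import yaml

# The dict from the hunk above, with the reduced CUDA-graph batch sizes.
extra_llm_options = {
    "cuda_graph_config": {
        "enable_padding": True,
        "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
    },
    "print_iter_log": True,
}

# Write it to a temporary YAML file that a server launch command could point at.
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(extra_llm_options, f)
    print(f.name)  # path of the generated options file
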
@@ -759,6 +794,7 @@ def create_aiperf_command(model_name,
                           model_path,
                           request_count,
                           concurrency,
+                          server_url,
                           input_len_mean=PerformanceParams.input_len_mean,
                           input_len_std=PerformanceParams.input_len_std,
                           output_len_mean=PerformanceParams.output_len_mean,
@@ -772,6 +808,7 @@ def create_aiperf_command(model_name,
         model_path: Path to the model
         request_count: Number of requests to send
         concurrency: Number of concurrent requests
+        server_url: URL of the server (e.g., "localhost:8000")
         input_len_mean: Mean input length
         input_len_std: Standard deviation of input length
         output_len_mean: Mean output length
@@ -790,6 +827,8 @@ def create_aiperf_command(model_name,
         model_path,
         "--endpoint-type",
         "completions",
+        "-u",
+        server_url,
         "--random-seed",
         "123",
         "--synthetic-input-tokens-mean",
@@ -928,6 +967,7 @@ def measure_capacity_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=performance_params.input_len_mean,
         input_len_std=performance_params.input_len_std,
         output_len_mean=performance_params.output_len_mean,
@@ -1023,6 +1063,7 @@ def stress_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=stress_concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=PerformanceParams.input_len_mean,
         input_len_std=PerformanceParams.input_len_std,
         output_len_mean=PerformanceParams.output_len_mean,
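
The remaining Python hunks are plumbing: `measure_capacity_stage` and `stress_stage` now build `server_url` from the (CI-allocated) `ServerConfig` fields, and `create_aiperf_command` forwards it via `-u` instead of relying on a fixed port 8000. A condensed, self-contained sketch of that flow (names prefixed with Demo are illustrative, not the real functions):

from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class DemoServerConfig:
    host: str = "localhost"
    port: int = 49214  # illustrative; the real value comes from the CI port allocator


def demo_create_aiperf_command(server_url: str) -> List[str]:
    # Mirrors the visible slice of create_aiperf_command: the target URL now
    # travels with the command instead of being hard-coded.
    return ["--endpoint-type", "completions", "-u", server_url]


cfg = DemoServerConfig()
print(demo_create_aiperf_command(f"{cfg.host}:{cfg.port}"))
# ['--endpoint-type', 'completions', '-u', 'localhost:49214']
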
@@ -242,8 +242,8 @@ l0_a10:
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-trt-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - test_e2e.py::test_gpt3_175b_1layers_build_only # 6 mins
   - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
   - unittest/trt/model/test_mamba.py # 3 mins
@@ -263,8 +263,8 @@ l0_a10:
     stage: post_merge
     backend: pytorch
   tests:
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
@@ -340,7 +340,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
-stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5814203)
 unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5814215)
 full:sm89/accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5814504)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5819005)
 unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014)
@@ -384,6 +384,8 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwel
 perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
 disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
 examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731)
 test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
 accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
 accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)