Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[https://nvbugs/5814203][fix] Fix the "port 8000 already in use" issue in the stress test. (#10756)
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Parent: ae58a7ed20
Commit: c98c286c0f
@@ -32,6 +32,7 @@ import contextlib
 import json
 import os
 import re
+import socket
 import subprocess
 import tempfile
 import threading
@@ -44,7 +45,7 @@ import pandas as pd
 import pytest
 import requests
 import yaml
-from defs.common import parse_gsm8k_output
+from defs.common import get_free_port_in_ci, parse_gsm8k_output
 from defs.conftest import get_device_count, get_device_memory, llm_models_root
 from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
                                        print_warning)
@@ -72,10 +73,18 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
 GRACEFUL_TERMINATION_TIMEOUT = 300 # seconds - set longer when stress large model
 
 
+def _get_default_port() -> int:
+    """Get a default port using CI allocation if available, otherwise use 8000."""
+    try:
+        return get_free_port_in_ci()
+    except Exception:
+        return 8000
+
+
 @dataclass(frozen=True)
 class ServerConfig:
     """Dataclass to store server configuration for trtllm-serve"""
-    port: int = 8000
+    port: int = field(default_factory=_get_default_port)
     host: str = "localhost"
     pp_size: int = 1
     ep_size: Optional[int] = 1
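
The `get_free_port_in_ci` helper lives in `defs.common` and its implementation is not part of this diff. Assuming it ultimately asks the OS for a free ephemeral port (an assumption, not confirmed by this commit), a minimal sketch of that technique looks like this:

import socket


def get_free_port_sketch(host: str = "localhost") -> int:
    """Hypothetical stand-in for defs.common.get_free_port_in_ci."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((host, 0))          # port 0 lets the kernel pick an unused port
        return s.getsockname()[1]  # report the port the kernel chose


if __name__ == "__main__":
    print(get_free_port_sketch())  # e.g. 49214
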
@@ -167,8 +176,7 @@ class PerformanceParams:
     # Ensure indefinite runs specially for different concurrency values
     test_timeout: int = 3600 # 1 hours for tinyllama and llama-v3-8b-instruct-hf
     concurrency_list: List[int] = field(
-        default_factory=lambda:
-        [8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024])
+        default_factory=lambda: [8, 16, 32, 64, 128, 256])
 
     @property
     def request_count_list(self) -> List[int]:
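
Both changed defaults rely on `dataclasses.field(default_factory=...)`, which runs the factory each time an instance is created rather than freezing one value at class-definition time. A self-contained sketch of the pattern (the class and factory names here are illustrative, not the real ones from the test):

from dataclasses import dataclass, field
from typing import List


def _pick_port() -> int:
    return 8000  # stand-in for the CI allocation / fallback logic above


@dataclass(frozen=True)
class DemoConfig:
    # Computed when DemoConfig() is instantiated, not when the class is defined.
    port: int = field(default_factory=_pick_port)
    concurrency_list: List[int] = field(
        default_factory=lambda: [8, 16, 32, 64, 128, 256])


print(DemoConfig().port)               # 8000
print(DemoConfig().concurrency_list)   # [8, 16, 32, 64, 128, 256]
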
@@ -341,6 +349,26 @@ def check_server_health(server_url: str,
         return False, f"Unexpected error during health check: {str(e)}"
 
 
+def is_port_available(port: int,
+                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
+    """
+    Check if a port is available for binding.
+
+    Args:
+        port: Port number to check
+        host: Host to bind to
+
+    Returns:
+        Tuple of (is_available, error_message)
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        try:
+            s.bind((host, port))
+            return True, None
+        except OSError as e:
+            return False, f"Port {port} is already in use on {host}: {e}"
+
+
 @pytest.mark.parametrize(
     "test_mode",
     ["stress-test", "stress-stage-alone", "stress-test-with-accuracy"],
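
To see what the new helper reports, here is a standalone demo; `is_port_available` is repeated (minus the docstring) from the hunk above so the snippet runs on its own:

import socket
from typing import Optional, Tuple


def is_port_available(port: int,
                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
    # Same logic as the helper added in this diff.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True, None
        except OSError as e:
            return False, f"Port {port} is already in use on {host}: {e}"


if __name__ == "__main__":
    # Occupy an arbitrary free port so the check has something to collide with.
    blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    blocker.bind(("localhost", 0))
    busy_port = blocker.getsockname()[1]
    print(is_port_available(busy_port))  # (False, 'Port ... is already in use ...')
    blocker.close()
    print(is_port_available(busy_port))  # usually (True, None) once the socket is closed
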
@@ -519,6 +547,10 @@ def stress_test(config,
     else:
         stress_config = None
 
+    # Check if port is available
+    is_available, port_error = is_port_available(test_server_config.port,
+                                                 test_server_config.host)
+
     # Check if server is already running
     is_healthy, _ = check_server_health(test_server_config.url,
                                         test_server_config.health_check_timeout)
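
This hunk only stores the result; whatever consumes `is_available` and `port_error` lies outside the excerpt. One plausible (purely hypothetical) follow-up fragment in a pytest-based harness would be:

import pytest

# Hypothetical, not shown in this diff: bail out with a readable message
# instead of letting trtllm-serve fail later on a bind error.
if not is_available:
    pytest.fail(f"Cannot start trtllm-serve: {port_error}")
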
@@ -530,6 +562,9 @@ def stress_test(config,
     # Start server
     print_info("Starting trtllm-serve server...")
     print_info(f"Model path: {model_path}")
+    print_info(
+        f"Server port: {test_server_config.port} (allocated via CI port mechanism)"
+    )
 
     # Verify that model path exists
     if not os.path.exists(model_path):
@@ -552,7 +587,7 @@ def stress_test(config,
         extra_llm_options.update({
             "cuda_graph_config": {
                 "enable_padding": True,
-                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
             },
             "print_iter_log": True,
         })
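
The trimmed `batch_sizes` list is part of an options dict that, judging by the `yaml` and `tempfile` imports at the top of the file, is presumably written out as a YAML options file for trtllm-serve. A minimal sketch of that serialization step, under that assumption (the surrounding launch logic is not part of this excerpt):

import tempfile

import yaml

# The dict from the hunk above, with the reduced CUDA-graph batch sizes.
extra_llm_options = {
    "cuda_graph_config": {
        "enable_padding": True,
        "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
    },
    "print_iter_log": True,
}

# Write it to a temporary YAML file that a server launch command could point at.
with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(extra_llm_options, f)
    print(f.name)  # path of the generated options file
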
@@ -759,6 +794,7 @@ def create_aiperf_command(model_name,
                           model_path,
                           request_count,
                           concurrency,
+                          server_url,
                           input_len_mean=PerformanceParams.input_len_mean,
                           input_len_std=PerformanceParams.input_len_std,
                           output_len_mean=PerformanceParams.output_len_mean,
@@ -772,6 +808,7 @@ def create_aiperf_command(model_name,
         model_path: Path to the model
         request_count: Number of requests to send
         concurrency: Number of concurrent requests
+        server_url: URL of the server (e.g., "localhost:8000")
         input_len_mean: Mean input length
         input_len_std: Standard deviation of input length
         output_len_mean: Mean output length
@@ -790,6 +827,8 @@ def create_aiperf_command(model_name,
         model_path,
         "--endpoint-type",
         "completions",
+        "-u",
+        server_url,
         "--random-seed",
         "123",
         "--synthetic-input-tokens-mean",
@@ -928,6 +967,7 @@ def measure_capacity_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=performance_params.input_len_mean,
         input_len_std=performance_params.input_len_std,
         output_len_mean=performance_params.output_len_mean,
@@ -1023,6 +1063,7 @@ def stress_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=stress_concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=PerformanceParams.input_len_mean,
         input_len_std=PerformanceParams.input_len_std,
         output_len_mean=PerformanceParams.output_len_mean,
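
The remaining Python hunks are plumbing: `measure_capacity_stage` and `stress_stage` now build `server_url` from the (CI-allocated) `ServerConfig` fields, and `create_aiperf_command` forwards it via `-u` instead of relying on a fixed port 8000. A condensed, self-contained sketch of that flow (names prefixed with Demo are illustrative, not the real functions):

from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class DemoServerConfig:
    host: str = "localhost"
    port: int = 49214  # illustrative; the real value comes from the CI port allocator


def demo_create_aiperf_command(server_url: str) -> List[str]:
    # Mirrors the visible slice of create_aiperf_command: the target URL now
    # travels with the command instead of being hard-coded.
    return ["--endpoint-type", "completions", "-u", server_url]


cfg = DemoServerConfig()
print(demo_create_aiperf_command(f"{cfg.host}:{cfg.port}"))
# ['--endpoint-type', 'completions', '-u', 'localhost:49214']
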
@@ -242,8 +242,8 @@ l0_a10:
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-trt-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - test_e2e.py::test_gpt3_175b_1layers_build_only # 6 mins
   - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
   - unittest/trt/model/test_mamba.py # 3 mins
@@ -263,8 +263,8 @@ l0_a10:
     stage: post_merge
     backend: pytorch
   tests:
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
@@ -340,7 +340,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
-stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5814203)
 unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5814215)
 full:sm89/accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5814504)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5819005)
 unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014)
@@ -384,6 +384,8 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwel
 perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
 disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
 examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731)
 test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
 accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
 accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)