diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index f81f0ab2bd..f5fed7b0eb 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -32,6 +32,7 @@ import contextlib
 import json
 import os
 import re
+import socket
 import subprocess
 import tempfile
 import threading
@@ -44,7 +45,7 @@ import pandas as pd
 import pytest
 import requests
 import yaml
-from defs.common import parse_gsm8k_output
+from defs.common import get_free_port_in_ci, parse_gsm8k_output
 from defs.conftest import get_device_count, get_device_memory, llm_models_root
 from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
                                        print_warning)
@@ -72,10 +73,18 @@ from defs.trt_test_alternative import (Popen, cleanup_process_tree, print_info,
 GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds - set longer when stress large model
 
 
+def _get_default_port() -> int:
+    """Get a default port using CI allocation if available, otherwise use 8000."""
+    try:
+        return get_free_port_in_ci()
+    except Exception:
+        return 8000
+
+
 @dataclass(frozen=True)
 class ServerConfig:
     """Dataclass to store server configuration for trtllm-serve"""
-    port: int = 8000
+    port: int = field(default_factory=_get_default_port)
     host: str = "localhost"
     pp_size: int = 1
     ep_size: Optional[int] = 1
@@ -167,8 +176,7 @@ class PerformanceParams:
     # Ensure indefinite runs specially for different concurrency values
     test_timeout: int = 3600  # 1 hours for tinyllama and llama-v3-8b-instruct-hf
     concurrency_list: List[int] = field(
-        default_factory=lambda:
-        [8, 16, 32, 64, 128, 256, 384, 512, 640, 768, 896, 1024])
+        default_factory=lambda: [8, 16, 32, 64, 128, 256])
 
     @property
     def request_count_list(self) -> List[int]:
@@ -341,6 +349,26 @@ def check_server_health(server_url: str,
         return False, f"Unexpected error during health check: {str(e)}"
 
 
+def is_port_available(port: int,
+                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
+    """
+    Check if a port is available for binding.
+
+    Args:
+        port: Port number to check
+        host: Host to bind to
+
+    Returns:
+        Tuple of (is_available, error_message)
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        try:
+            s.bind((host, port))
+            return True, None
+        except OSError as e:
+            return False, f"Port {port} is already in use on {host}: {e}"
+
+
 @pytest.mark.parametrize(
     "test_mode",
     ["stress-test", "stress-stage-alone", "stress-test-with-accuracy"],
@@ -519,6 +547,10 @@ def stress_test(config,
     else:
         stress_config = None
 
+    # Check if port is available
+    is_available, port_error = is_port_available(test_server_config.port,
+                                                 test_server_config.host)
+
     # Check if server is already running
     is_healthy, _ = check_server_health(test_server_config.url,
                                         test_server_config.health_check_timeout)
@@ -530,6 +562,9 @@
     # Start server
     print_info("Starting trtllm-serve server...")
     print_info(f"Model path: {model_path}")
+    print_info(
+        f"Server port: {test_server_config.port} (allocated via CI port mechanism)"
+    )
 
     # Verify that model path exists
     if not os.path.exists(model_path):
@@ -552,7 +587,7 @@
         extra_llm_options.update({
             "cuda_graph_config": {
                 "enable_padding": True,
-                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
             },
             "print_iter_log": True,
         })
@@ -759,6 +794,7 @@ def create_aiperf_command(model_name,
                           model_path,
                           request_count,
                           concurrency,
+                          server_url,
                           input_len_mean=PerformanceParams.input_len_mean,
                           input_len_std=PerformanceParams.input_len_std,
                           output_len_mean=PerformanceParams.output_len_mean,
@@ -772,6 +808,7 @@ def create_aiperf_command(model_name,
         model_path: Path to the model
         request_count: Number of requests to send
         concurrency: Number of concurrent requests
+        server_url: URL of the server (e.g., "localhost:8000")
         input_len_mean: Mean input length
         input_len_std: Standard deviation of input length
         output_len_mean: Mean output length
@@ -790,6 +827,8 @@ def create_aiperf_command(model_name,
         model_path,
         "--endpoint-type",
         "completions",
+        "-u",
+        server_url,
         "--random-seed",
         "123",
         "--synthetic-input-tokens-mean",
@@ -928,6 +967,7 @@ def measure_capacity_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=performance_params.input_len_mean,
         input_len_std=performance_params.input_len_std,
         output_len_mean=performance_params.output_len_mean,
@@ -1023,6 +1063,7 @@ def stress_stage(model_name,
         model_path=model_path,
         request_count=request_count,
         concurrency=stress_concurrency,
+        server_url=f"{server_config.host}:{server_config.port}",
         input_len_mean=PerformanceParams.input_len_mean,
         input_len_std=PerformanceParams.input_len_std,
         output_len_mean=PerformanceParams.output_len_mean,
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index c8dc811a37..3d8228b479 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -242,8 +242,8 @@ l0_a10:
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-trt-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-trt-stress-test]
   - test_e2e.py::test_gpt3_175b_1layers_build_only # 6 mins
   - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
   - unittest/trt/model/test_mamba.py # 3 mins
@@ -263,8 +263,8 @@ l0_a10:
       stage: post_merge
       backend: pytorch
   tests:
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 08ae4a8399..8f661504ec 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -340,7 +340,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
-stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5814203)
+unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5814215)
 full:sm89/accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5814504)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5819005)
 unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (https://nvbugs/5819014)
@@ -384,6 +384,8 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwel
 perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
 disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
+examples/test_ray.py::test_llm_inference_distributed_ray[tp2pp2] SKIP (https://nvbugs/5781731)
+test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
 accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
 accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
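
For reviewers who want to exercise the new port-selection logic outside the harness, below is a minimal standalone sketch of the pattern the patch introduces (prefer a CI-allocated port, fall back to a fixed default, then probe the port with a bind before launching the server). The `get_free_port_in_ci` stub and the `pick_port` name are illustrative stand-ins, not the real `defs.common` helper or `_get_default_port`; the bind probe is best-effort, since a port can be taken again between the check and server startup.

# port_check_sketch.py -- standalone sketch of the port-selection pattern
# used in stress_test.py above. get_free_port_in_ci below is a hypothetical
# stub standing in for the real helper imported from defs.common.
import socket
from typing import Optional, Tuple


def get_free_port_in_ci() -> int:
    # Stub: the real helper returns a CI-allocated port; outside CI it is
    # expected to fail, which triggers the fallback in pick_port.
    raise RuntimeError("no CI port allocator available")


def pick_port(default: int = 8000) -> int:
    # Mirrors _get_default_port: prefer CI allocation, else a fixed default.
    try:
        return get_free_port_in_ci()
    except Exception:
        return default


def is_port_available(port: int,
                      host: str = "localhost") -> Tuple[bool, Optional[str]]:
    # Same bind probe as the patch. Note this is best-effort: another process
    # can still grab the port between this check and server startup (TOCTOU).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True, None
        except OSError as e:
            return False, f"Port {port} is already in use on {host}: {e}"


if __name__ == "__main__":
    port = pick_port()
    ok, err = is_port_available(port)
    print(f"port={port} available={ok}" + (f" ({err})" if err else ""))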