""" End-to-End Negative Tests for trtllm-serve These tests verify that trtllm-serve handles error conditions gracefully: - Invalid inputs and malformed requests - Server stability under stress with invalid requests - Proper error responses and status codes - Recovery after encountering errors """ import asyncio import time from pathlib import Path import openai import pytest import requests from defs.conftest import llm_models_root from defs.trt_test_alternative import popen, print_error, print_info from tensorrt_llm._utils import get_free_port class RemoteOpenAIServer: DUMMY_API_KEY = "tensorrt_llm" def __init__(self, host: str, port: int) -> None: self.host = host self.port = port @property def url_root(self) -> str: return f"http://{self.host}:{self.port}" def url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) def get_async_client(self, **kwargs): return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, **kwargs) @pytest.fixture(scope="module") def model_name(): """Use TinyLlama for faster testing""" return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" @pytest.fixture(scope="module") def model_path(model_name): """Get the full model path""" return str(Path(llm_models_root()) / model_name) @pytest.fixture(scope="module") def server(model_path): """Start a test server for the module using popen like test_serve.py""" host_bind = "0.0.0.0" client_host = "localhost" port = get_free_port() cmd = [ "trtllm-serve", "serve", model_path, "--host", host_bind, "--port", str(port), "--backend", "pytorch", ] def _wait_until_ready(timeout_secs: int = 600, interval: float = 0.5): start = time.time() health_url = f"http://{client_host}:{port}/health" while True: try: if requests.get(health_url, timeout=2).status_code == 200: break except Exception: pass if time.time() - start > timeout_secs: raise TimeoutError("Error: trtllm-serve health check timed out") time.sleep(interval) print_info("Launching trtllm-serve (negative tests)...") with popen(cmd): _wait_until_ready() yield RemoteOpenAIServer(client_host, port) @pytest.fixture def async_client(server: RemoteOpenAIServer): """Get an async OpenAI client""" return server.get_async_client() @pytest.mark.asyncio async def test_invalid_max_tokens(async_client: openai.AsyncOpenAI, model_name: str): """Test that server rejects invalid max_tokens value.""" print_info("Testing invalid max_tokens parameter: 0") with pytest.raises(openai.BadRequestError) as exc_info: await async_client.chat.completions.create( model=model_name, messages=[{ "role": "user", "content": "Hello" }], max_tokens=0, ) error_msg = str(exc_info.value).lower() assert any(keyword in error_msg for keyword in ("max_tokens", "must be greater than 0")) @pytest.mark.asyncio async def test_invalid_temperature(async_client: openai.AsyncOpenAI, model_name: str): """Test that server rejects invalid temperature value.""" print_info("Testing invalid temperature parameter: -0.5") with pytest.raises(openai.BadRequestError) as exc_info: await async_client.chat.completions.create( model=model_name, messages=[{ "role": "user", "content": "Hello" }], temperature=-0.5, ) assert "temperature" in str(exc_info.value).lower() or "invalid" in str( exc_info.value).lower() @pytest.mark.parametrize("top_p_value", [-0.1, 1.1]) @pytest.mark.asyncio async def test_invalid_top_p(async_client: openai.AsyncOpenAI, model_name: str, top_p_value: float): """Test that server rejects invalid top_p values.""" print_info(f"Testing invalid top_p parameter: {top_p_value}") with 
        await async_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": "Hello"}],
            top_p=top_p_value,
        )
    error_msg = str(exc_info.value).lower()
    assert "top_p" in error_msg or "invalid" in error_msg


@pytest.mark.asyncio
async def test_empty_messages_array(async_client: openai.AsyncOpenAI,
                                    model_name: str):
    """Test that server rejects empty messages array."""
    print_info("Testing empty messages array...")
    with pytest.raises(openai.BadRequestError) as exc_info:
        await async_client.chat.completions.create(model=model_name,
                                                   messages=[],
                                                   max_tokens=10)
    error_msg = str(exc_info.value).lower()
    assert "message" in error_msg or "empty" in error_msg


@pytest.mark.asyncio
async def test_missing_message_role(async_client: openai.AsyncOpenAI,
                                    model_name: str):
    """Test that server rejects messages without role field."""
    print_info("Testing missing message role...")
    with pytest.raises(openai.BadRequestError) as exc_info:
        await async_client.chat.completions.create(
            model=model_name,
            messages=[{"content": "Hello"}],  # Missing 'role'
            max_tokens=10)
    assert "role" in str(exc_info.value).lower()


@pytest.mark.asyncio
async def test_invalid_token_ids(async_client: openai.AsyncOpenAI,
                                 model_name: str):
    """Test that server handles invalid token IDs in prompt."""
    print_info("Testing invalid token IDs...")
    # Test negative token ID
    with pytest.raises((openai.BadRequestError, openai.APIError)) as exc_info:
        await async_client.completions.create(
            model=model_name,
            prompt=[1, 2, 3, -1, 5],  # Invalid token ID: -1
            max_tokens=5)
    error_msg = str(exc_info.value).lower()
    assert "token" in error_msg or "invalid" in error_msg or "range" in error_msg


@pytest.mark.asyncio
async def test_extremely_large_token_id(async_client: openai.AsyncOpenAI,
                                        model_name: str):
    """Test that server handles token IDs exceeding vocabulary size."""
    print_info("Testing extremely large token ID...")
    # Test token ID beyond typical vocabulary size
    with pytest.raises((openai.BadRequestError, openai.APIError)) as exc_info:
        await async_client.completions.create(
            model=model_name,
            prompt=[1, 2, 3, 999999],  # Token ID far beyond vocab size
            max_tokens=5)
    error_msg = str(exc_info.value).lower()
    assert ("token" in error_msg or "range" in error_msg
            or "vocabulary" in error_msg or "vocab" in error_msg)


@pytest.mark.asyncio
async def test_server_stability_under_invalid_requests(
        server: RemoteOpenAIServer, model_name: str):
    """
    E2E Test: Verify server remains stable after receiving many invalid requests.

    Test flow:
    1. Send valid request to verify server is working
    2. Flood server with invalid requests
    3. Send valid request to verify server still works
    4. Check health endpoint
    """
Check health endpoint """ print_info("Testing server stability under invalid requests...") async_client = server.get_async_client() # Step 1: Verify server is working with valid request response = await async_client.chat.completions.create(model=model_name, messages=[{ "role": "user", "content": "Hello" }], max_tokens=5) assert response is not None assert len(response.choices) > 0 print_info("Initial valid request succeeded") # Step 2: Send multiple invalid requests invalid_request_types = [ # Empty messages { "messages": [], "max_tokens": 5 }, # Missing role { "messages": [{ "content": "test" }], "max_tokens": 5 }, # Invalid temperature { "messages": [{ "role": "user", "content": "test" }], "temperature": -1 }, # Invalid max_tokens { "messages": [{ "role": "user", "content": "test" }], "max_tokens": -10 }, # Invalid top_p { "messages": [{ "role": "user", "content": "test" }], "top_p": 2.0 }, ] error_count = 0 for _ in range(20): # Send 100 total invalid requests (20 x 5 types) for invalid_params in invalid_request_types: try: await async_client.chat.completions.create(model=model_name, **invalid_params) except (openai.BadRequestError, openai.APIError): error_count += 1 except Exception as e: # Unexpected error - server might be unstable pytest.fail(f"Unexpected error during invalid request: {e}") print_info( f"Sent {error_count} invalid requests, all rejected as expected.") # Step 3: Verify server still works with valid request response = await async_client.chat.completions.create(model=model_name, messages=[{ "role": "user", "content": "Hello again" }], max_tokens=5) assert response is not None assert len(response.choices) > 0 print_info("Server still responsive after invalid requests.") # Step 4: Check health endpoint health_url = server.url_for("health") health_response = requests.get(health_url) assert health_response.status_code == 200 print_info("Health check passed.") @pytest.mark.asyncio async def test_concurrent_invalid_requests(server: RemoteOpenAIServer, model_name: str): """ E2E Test: Multiple concurrent invalid requests should not crash server Simulates multiple clients sending invalid requests simultaneously """ print_info("Testing concurrent invalid requests...") async_client = server.get_async_client() # Create 50 concurrent invalid requests tasks = [] for i in range(50): # Alternate between different types of invalid requests if i % 3 == 0: task = async_client.chat.completions.create( model=model_name, messages=[], # Empty messages max_tokens=5) elif i % 3 == 1: task = async_client.chat.completions.create( model=model_name, messages=[{ "role": "user", "content": "test" }], temperature=-1 # Invalid temperature ) else: task = async_client.chat.completions.create( model=model_name, messages=[{ "role": "user", "content": "test" }], max_tokens=-5 # Invalid max_tokens ) tasks.append(task) # Execute all concurrently and gather results results = await asyncio.gather(*tasks, return_exceptions=True) # All should be BadRequestError or APIError for i, result in enumerate(results): assert isinstance(result, (openai.BadRequestError, openai.APIError)), \ f"Request {i} should have failed with BadRequestError or APIError, got: {type(result)}" print_info( f"All {len(results)} concurrent invalid requests rejected properly") # Verify server still works response = await async_client.chat.completions.create(model=model_name, messages=[{ "role": "user", "content": "Final check" }], max_tokens=5) assert response is not None @pytest.mark.asyncio async def test_mixed_valid_invalid_requests(server: 
    """
    E2E Test: Mix of valid and invalid requests - server should handle both correctly.

    Simulates real-world scenario where some clients send bad requests.
    """
    print_info("Testing mixed valid and invalid requests...")
    async_client = server.get_async_client()

    async def send_request(request_id: int) -> dict:
        """Send either valid or invalid request based on request_id."""
        result = {
            "id": request_id,
            "success": False,
            "expected_error": False,
            "unexpected_error": False
        }
        try:
            if request_id % 4 == 0:
                # Send invalid request (25% of requests)
                await async_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": "test"}],
                    temperature=-1  # Invalid
                )
                result["unexpected_error"] = True  # Shouldn't succeed
            else:
                # Send valid request (75% of requests)
                response = await async_client.chat.completions.create(
                    model=model_name,
                    messages=[{
                        "role": "user",
                        "content": f"Request {request_id}"
                    }],
                    max_tokens=5,
                    temperature=0.5)
                if response and len(response.choices) > 0:
                    result["success"] = True
        except openai.BadRequestError:
            result["expected_error"] = True
        except Exception as e:
            print_error(f"Request {request_id} unexpected error: {e}")
            result["unexpected_error"] = True
        return result

    # Send 100 mixed requests
    tasks = [send_request(i) for i in range(100)]
    results = await asyncio.gather(*tasks)

    # Analyze results
    successful = sum(1 for r in results if r["success"])
    expected_errors = sum(1 for r in results if r["expected_error"])
    unexpected_errors = sum(1 for r in results if r["unexpected_error"])

    print_info(
        f"Results: {successful} successful, {expected_errors} expected errors, "
        f"{unexpected_errors} unexpected errors")

    # Assertions
    assert successful > 0, "Some valid requests should have succeeded"
    assert expected_errors > 0, "Some invalid requests should have been caught"
    assert unexpected_errors == 0, "No unexpected errors should occur"
    # Roughly 75% should succeed, 25% should fail
    assert successful >= 60, f"Expected ~75 successful requests, got {successful}"
    assert expected_errors >= 20, f"Expected ~25 failed requests, got {expected_errors}"


@pytest.mark.asyncio
async def test_health_check_during_errors(server: RemoteOpenAIServer,
                                          model_name: str):
    """
    E2E Test: Health endpoints should remain functional even when receiving
    invalid requests.
    """
    print_info("Testing health check during error conditions...")

    async def send_invalid_requests():
        """Background task sending invalid requests."""
        async_client = server.get_async_client()
        for _ in range(50):
            try:
                await async_client.chat.completions.create(
                    model=model_name,
                    messages=[],  # Invalid
                    max_tokens=5)
            except Exception:
                pass  # Expected to fail
            await asyncio.sleep(0.05)

    # Start background task sending invalid requests
    background_task = asyncio.create_task(send_invalid_requests())

    # Meanwhile, check health endpoint repeatedly
    health_url = server.url_for("health")
    health_checks_passed = 0
    for _ in range(20):
        await asyncio.sleep(0.1)
        try:
            health_response = requests.get(health_url, timeout=2)
            if health_response.status_code == 200:
                health_checks_passed += 1
        except Exception as e:
            pytest.fail(f"Health check failed during error conditions: {e}")

    # Wait for background task to complete
    await background_task

    assert health_checks_passed == 20, \
        f"All health checks should pass, got {health_checks_passed}/20"


@pytest.mark.asyncio
async def test_request_exceeds_context_length(async_client: openai.AsyncOpenAI,
                                              model_name: str):
    """Test handling of requests exceeding model's max context length."""
    print_info("Testing request exceeding context length...")
exceeding context length...") # Generate extremely long prompt (> max_seq_len for TinyLlama) # TinyLlama has max_position_embeddings of 2048 very_long_prompt = "word " * 3000 # ~15000 characters, way over limit # Server should either reject or handle gracefully without crashing try: response = await async_client.chat.completions.create( model=model_name, messages=[{ "role": "user", "content": very_long_prompt }], max_tokens=10) # If it succeeds, verify response is valid assert response is not None print_info("Server handled oversized request gracefully") except (openai.BadRequestError, openai.APIError) as e: # Also acceptable - server rejected the request assert "length" in str(e).lower() or "token" in str( e).lower() or "context" in str(e).lower() def test_malformed_json_request(server: RemoteOpenAIServer): """Test that server rejects malformed JSON in HTTP requests""" print_info("Testing malformed JSON request...") chat_url = server.url_for("v1", "chat", "completions") # Send invalid JSON response = requests.post( chat_url, headers={"Content-Type": "application/json"}, data="{invalid json syntax here}", ) # Should return 400 Bad Request assert response.status_code == 400 def test_missing_content_type_header(server: RemoteOpenAIServer, model_name: str): """Test server behavior with missing Content-Type header""" print_info("Testing missing Content-Type header...") chat_url = server.url_for("v1", "chat", "completions") # Send request without Content-Type header import json payload = { "model": model_name, "messages": [{ "role": "user", "content": "Hello" }], "max_tokens": 5 } response = requests.post( chat_url, data=json.dumps(payload), # No Content-Type header ) # Server might accept it or reject it - either way it shouldn't crash assert response.status_code in [ 200, 400, 415 ] # Success, Bad Request, or Unsupported Media Type @pytest.mark.asyncio async def test_extremely_large_batch(async_client: openai.AsyncOpenAI, model_name: str): """Test handling of extremely large batch requests for completions""" print_info("Testing extremely large batch request...") # Try to send batch with many prompts large_batch = ["Hello"] * 1000 # 1000 prompts try: # This should either process or reject gracefully response = await async_client.completions.create(model=model_name, prompt=large_batch, max_tokens=1) # If successful, verify we got results assert response is not None assert hasattr(response, "choices") and len(response.choices) > 0 print_info("Server processed large batch.") except (openai.BadRequestError, openai.APIError) as e: # Server rejected - also acceptable assert "batch" in str(e).lower() or "too many" in str( e).lower() or "limit" in str(e).lower()