TensorRT-LLMs/tests/unittest/llmapi/apps/_test_openai_metrics.py

"""Test the metrics endpoint when using OpenAI API to send requests"""
from unittest.mock import patch

import pytest
from fastapi.testclient import TestClient

from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.serve.openai_server import OpenAIServer

from ..test_llm import llama_model_path

pytestmark = pytest.mark.threadleak(enabled=False)
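

# Module-scoped LLM fixture: one PyTorch LLM instance is shared across all
# tests, with per-iteration performance stats enabled so that the /metrics
# endpoint has iteration stats to report.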
@pytest.fixture(scope="module")
def llm():
    llm = PyTorchLLM(model=llama_model_path,
                     kv_cache_config=KvCacheConfig(),
                     enable_iter_perf_stats=True)
    yield llm
    llm.shutdown()
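

# Wrap the LLM in an OpenAIServer and expose its FastAPI app through a
# TestClient, so requests are handled in-process rather than over the network.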
@pytest.fixture(scope="module")
def client(llm):
    app_instance = OpenAIServer(llm,
                                model=llama_model_path,
                                tool_parser=None,
                                server_role=None,
                                metadata_server_cfg=None)
    client = TestClient(app_instance.app)
    yield client
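

# /health should return 200 while the executor is running and 503 once it has
# shut down; the unhealthy case is simulated by patching
# llm._executor.is_shutdown to return True.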
@pytest.mark.parametrize("is_healthy,response_code", [(True, 200),
                                                      (False, 503)])
def test_health(client, llm, is_healthy, response_code):
    if not is_healthy:
        with patch.object(llm._executor, 'is_shutdown', return_value=True):
            response = client.get("/health")
            assert response.status_code == response_code
    else:
        response = client.get("/health")
        assert response.status_code == response_code


def test_version(client):
    response = client.get("/version")
    assert response.status_code == 200
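

# Drive one short completion through /v1/completions, then verify that
# /metrics returns the latest iteration stats with the expected fields
# (memory usage, in-flight batching stats, KV-cache stats, batch/token
# limits, queueing latency, and request counters).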
def test_metrics(client):
    response = client.post("/v1/completions",
                           json={
                               "prompt": "A B C",
                               "model": llama_model_path,
                               "max_tokens": 10
                           })
    assert response.status_code == 200
    assert "D E F" in response.json()["choices"][0]["text"]

    response = client.get("/metrics")
    assert response.status_code == 200
    response_dict = response.json()[0]
    assert "cpuMemUsage" in response_dict
    assert "gpuMemUsage" in response_dict
    assert "inflightBatchingStats" in response_dict
    assert "numContextRequests" in response_dict["inflightBatchingStats"]
    assert "numCtxTokens" in response_dict["inflightBatchingStats"]
    assert "numGenRequests" in response_dict["inflightBatchingStats"]
    assert "numPausedRequests" in response_dict["inflightBatchingStats"]
    assert "numScheduledRequests" in response_dict["inflightBatchingStats"]
    assert "iter" in response_dict
    assert "iterLatencyMS" in response_dict
    assert "kvCacheStats" in response_dict
    assert "allocNewBlocks" in response_dict["kvCacheStats"]
    assert "allocTotalBlocks" in response_dict["kvCacheStats"]
    assert "cacheHitRate" in response_dict["kvCacheStats"]
    assert "freeNumBlocks" in response_dict["kvCacheStats"]
    assert "maxNumBlocks" in response_dict["kvCacheStats"]
    assert "missedBlocks" in response_dict["kvCacheStats"]
    assert "reusedBlocks" in response_dict["kvCacheStats"]
    assert "tokensPerBlock" in response_dict["kvCacheStats"]
    assert "usedNumBlocks" in response_dict["kvCacheStats"]
    assert "maxBatchSizeRuntime" in response_dict
    assert "maxBatchSizeStatic" in response_dict
    assert "maxBatchSizeTunerRecommended" in response_dict
    assert "maxNumActiveRequests" in response_dict
    assert "maxNumTokensRuntime" in response_dict
    assert "maxNumTokensStatic" in response_dict
    assert "maxNumTokensTunerRecommended" in response_dict
    assert "newActiveRequestsQueueLatencyMS" in response_dict
    assert "numActiveRequests" in response_dict
    assert "numCompletedRequests" in response_dict
    assert "numNewActiveRequests" in response_dict
    assert "numQueuedRequests" in response_dict
    assert "pinnedMemUsage" in response_dict
    assert "staticBatchingStats" in response_dict
    assert "timestamp" in response_dict