[TRTLLM-8341][feat] Support benchmark_serving with responses API

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
Junyi Xu 2025-12-16 04:37:28 +00:00 committed by JunyiXu-nv
parent 0517b62789
commit b938657dd4
3 changed files with 132 additions and 2 deletions


@@ -414,6 +414,126 @@ async def async_request_openai_chat_completions(
    return output


async def async_request_openai_responses(
    request_func_input: RequestFuncInput,
    streaming: bool = True,
    pbar: Optional[tqdm] = None,
    session: Optional[aiohttp.ClientSession] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    if not api_url.endswith("responses"):
        raise ValueError("OpenAI Responses API URL must end with 'responses'.")

    # Reuse the caller's session if one was provided; otherwise open a
    # dedicated session with unlimited connections for this request.
    request_session = aiohttp.ClientSession(
        trust_env=True,
        timeout=AIOHTTP_TIMEOUT,
        connector=aiohttp.TCPConnector(
            limit=0, limit_per_host=0)) if session is None else session

    if not isinstance(request_func_input.prompt, str):
        raise ValueError("Prompt must be a string.")
    if request_func_input.multi_modal_content:
        raise NotImplementedError(
            "Multi-modal content is not yet supported by the Responses API.")
    if request_func_input.ignore_eos:
        raise NotImplementedError(
            "Ignore EOS is not supported by the Responses API.")

    payload = {
        "model": request_func_input.model_name
        if request_func_input.model_name else request_func_input.model,
        "input": request_func_input.prompt,
        "temperature": 0.0,
        "max_output_tokens": request_func_input.output_len,
        "stream": streaming,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }

    output = RequestFuncOutput()
    output.prompt_len = request_func_input.prompt_len

    generated_text = ""
    ttft = 0.0
    st = time.perf_counter()
    most_recent_timestamp = st
    try:
        async with request_session.post(url=api_url,
                                        json=payload,
                                        headers=headers) as response:
            if response.status == 200:
                output.success = True
                if streaming:
                    # Server-Sent Events stream: "event:" lines are skipped
                    # and each "data:" payload is parsed as JSON until the
                    # "[DONE]" sentinel arrives.
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk.startswith("event"):
                            continue
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if delta := data.get("delta"):
                                # First token: record time-to-first-token.
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                # Decoding phase: record inter-token latency.
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)
                                generated_text += delta or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "output_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    content = await response.content.read()
                    data = json.loads(content.decode())
                    output.generated_text = data["output"][0]["content"][0][
                        "text"]
                    if data["usage"] is not None:
                        output.output_tokens = data["usage"]["output_tokens"]
                    output.itl = []
                    output.latency = time.perf_counter() - st
                    # TTFT is not measurable without streaming.
                    output.ttft = -1
            else:
                # TODO: Need to store the status code to debug and report
                output.error = response.reason or ""
                output.success = False
    except Exception as e:
        output.success = False
        exc_info = sys.exc_info()
        output.error = "".join(traceback.format_exception(*exc_info))
        output.exception_type = e.__class__.__name__
    finally:
        # Only close the session if this function created it.
        if session is None:
            await request_session.close()

    if pbar:
        pbar.update(1)
    return output
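
As a usage sketch, the new request function can also be awaited directly outside the benchmark harness. This is illustrative only: it assumes RequestFuncInput accepts the fields referenced above (model, api_url, prompt, prompt_len, output_len) as keyword arguments, and the model name and localhost address are hypothetical.

import asyncio

async def demo():
    req = RequestFuncInput(
        model="gpt-oss-20b",            # hypothetical model name
        api_url="http://localhost:8000/v1/responses",
        prompt="Hello, world",
        prompt_len=3,                   # prompt length in tokens
        output_len=32,                  # cap on generated tokens
    )
    out = await async_request_openai_responses(req, streaming=True)
    # TTFT and inter-token latencies are populated only in streaming mode.
    print(out.ttft, out.latency, out.generated_text)

asyncio.run(demo())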
def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
@@ -435,10 +555,12 @@ def get_tokenizer(
ASYNC_REQUEST_FUNCS = {
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "openai-responses": async_request_openai_responses,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions,
             async_request_openai_responses)
]
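
For orientation, benchmark_serving selects the request coroutine by backend name from this table. A minimal dispatch sketch, using the names registered above (the helper and its arguments are hypothetical):

async def send_one(req: RequestFuncInput,
                   pbar: Optional[tqdm]) -> RequestFuncOutput:
    # Look up the coroutine registered for the chosen backend
    # and await it once per benchmark request.
    request_func = ASYNC_REQUEST_FUNCS["openai-responses"]
    return await request_func(request_func_input=req, pbar=pbar)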


@@ -699,6 +699,8 @@ def main(args: argparse.Namespace):
    if backend == "openai-chat":
        args.endpoint = "/v1/chat/completions"
    elif backend == "openai-responses":
        args.endpoint = "/v1/responses"
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
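
Concretely, with the new branch above, a run passing --backend openai-responses and a --base-url (the address below is hypothetical) resolves the request URL as follows:

backend = "openai-responses"
base_url = "http://localhost:8000"   # hypothetical --base-url value
endpoint = "/v1/responses"           # set by the elif branch above
api_url = f"{base_url}{endpoint}"    # -> "http://localhost:8000/v1/responses"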


@@ -50,14 +50,20 @@ def dataset_path(dataset_name: str):
        pytest.param("gpt_oss/gpt-oss-20b", marks=skip_pre_hopper)
    ],
    indirect=True)
@pytest.mark.parametrize("backend", [
    "openai",
    "openai-responses",
])
def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
                                model_path: str, backend: str):
    model_name = model_path.split("/")[-1]
    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
    dataset = dataset_path("sharegpt")
    benchmark_cmd = [
        "python3",
        client_script,
        "--backend",
        backend,
        "--dataset-name",
        "sharegpt",
        "--model",