[TRTLLM-8341][feat] Support benchmark_serving with responses API

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
Junyi Xu 2025-12-16 04:37:28 +00:00 committed by JunyiXu-nv
parent 0517b62789
commit b938657dd4
3 changed files with 132 additions and 2 deletions


@@ -414,6 +414,126 @@ async def async_request_openai_chat_completions(
    return output


async def async_request_openai_responses(
    request_func_input: RequestFuncInput,
    streaming: bool = True,
    pbar: Optional[tqdm] = None,
    session: Optional[aiohttp.ClientSession] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    if not api_url.endswith("responses"):
        raise ValueError("OpenAI Responses API URL must end with 'responses'.")

    # Reuse the caller's session if one was provided; otherwise open a
    # dedicated session with unlimited connections for this request.
    request_session = aiohttp.ClientSession(
        trust_env=True,
        timeout=AIOHTTP_TIMEOUT,
        connector=aiohttp.TCPConnector(
            limit=0, limit_per_host=0)) if session is None else session

    if not isinstance(request_func_input.prompt, str):
        raise ValueError("Prompt must be a string.")
    if request_func_input.multi_modal_content:
        raise NotImplementedError(
            "Multi-modal content is not yet supported by the Responses API.")
    if request_func_input.ignore_eos:
        raise NotImplementedError(
            "Ignore EOS is not supported by the Responses API.")

    payload = {
        "model": request_func_input.model_name
        if request_func_input.model_name else request_func_input.model,
        "input": request_func_input.prompt,
        "temperature": 0.0,
        "max_output_tokens": request_func_input.output_len,
        "stream": streaming,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }

    output = RequestFuncOutput()
    output.prompt_len = request_func_input.prompt_len

    generated_text = ""
    ttft = 0.0
    st = time.perf_counter()
    most_recent_timestamp = st
    try:
        async with request_session.post(url=api_url,
                                        json=payload,
                                        headers=headers) as response:
            if response.status == 200:
                output.success = True
                if streaming:
                    # Server-Sent Events stream: "event:" lines are skipped
                    # and each "data:" payload is parsed as JSON until the
                    # "[DONE]" sentinel arrives.
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk.startswith("event"):
                            continue
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if delta := data.get("delta"):
                                # First token: record time-to-first-token.
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                # Decoding phase: record inter-token latency.
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)
                                generated_text += delta or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "output_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    content = await response.content.read()
                    data = json.loads(content.decode())
                    output.generated_text = data["output"][0]["content"][0][
                        "text"]
                    if data["usage"] is not None:
                        output.output_tokens = data["usage"]["output_tokens"]
                    output.itl = []
                    output.latency = time.perf_counter() - st
                    # TTFT is not measurable without streaming.
                    output.ttft = -1
            else:
                # TODO: Need to store the status code to debug and report
                output.error = response.reason or ""
                output.success = False
    except Exception as e:
        output.success = False
        exc_info = sys.exc_info()
        output.error = "".join(traceback.format_exception(*exc_info))
        output.exception_type = e.__class__.__name__
    finally:
        # Only close the session if this function created it.
        if session is None:
            await request_session.close()

    if pbar:
        pbar.update(1)
    return output
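
As a usage sketch, the new request function can also be awaited directly outside the benchmark harness. This is illustrative only: it assumes RequestFuncInput accepts the fields referenced above (model, api_url, prompt, prompt_len, output_len) as keyword arguments, and the model name and localhost address are hypothetical.

import asyncio

async def demo():
    req = RequestFuncInput(
        model="gpt-oss-20b",            # hypothetical model name
        api_url="http://localhost:8000/v1/responses",
        prompt="Hello, world",
        prompt_len=3,                   # prompt length in tokens
        output_len=32,                  # cap on generated tokens
    )
    out = await async_request_openai_responses(req, streaming=True)
    # TTFT and inter-token latencies are populated only in streaming mode.
    print(out.ttft, out.latency, out.generated_text)

asyncio.run(demo())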
def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
@@ -435,10 +555,12 @@ def get_tokenizer(
ASYNC_REQUEST_FUNCS = {
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "openai-responses": async_request_openai_responses,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions,
             async_request_openai_responses)
]
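
For orientation, benchmark_serving selects the request coroutine by backend name from this table. A minimal dispatch sketch, using the names registered above (the helper and its arguments are hypothetical):

async def send_one(req: RequestFuncInput,
                   pbar: Optional[tqdm]) -> RequestFuncOutput:
    # Look up the coroutine registered for the chosen backend
    # and await it once per benchmark request.
    request_func = ASYNC_REQUEST_FUNCS["openai-responses"]
    return await request_func(request_func_input=req, pbar=pbar)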


@@ -699,6 +699,8 @@ def main(args: argparse.Namespace):
    if backend == "openai-chat":
        args.endpoint = "/v1/chat/completions"
    elif backend == "openai-responses":
        args.endpoint = "/v1/responses"
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
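
Concretely, with the new branch above, a run passing --backend openai-responses and a --base-url (the address below is hypothetical) resolves the request URL as follows:

backend = "openai-responses"
base_url = "http://localhost:8000"   # hypothetical --base-url value
endpoint = "/v1/responses"           # set by the elif branch above
api_url = f"{base_url}{endpoint}"    # -> "http://localhost:8000/v1/responses"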


@@ -50,14 +50,20 @@ def dataset_path(dataset_name: str):
        pytest.param("gpt_oss/gpt-oss-20b", marks=skip_pre_hopper)
    ],
    indirect=True)
@pytest.mark.parametrize("backend", [
    "openai",
    "openai-responses",
])
def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
                                model_path: str, backend: str):
    model_name = model_path.split("/")[-1]
    client_script = os.path.join(benchmark_root, "benchmark_serving.py")
    dataset = dataset_path("sharegpt")
    benchmark_cmd = [
        "python3",
        client_script,
        "--backend",
        backend,
        "--dataset-name",
        "sharegpt",
        "--model",