Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 15:55:08 +08:00)
[None][test] Enhance multi-GPU tests for IFB stats (#11239)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Parent: 31db399042
Commit: b3e4ddc953
@@ -2062,13 +2062,16 @@ def validate_stats(
        ifbStats = result["inflightBatchingStats"]
        print(f"iter: {iter}, ifbStats: {ifbStats}")

    expected_num_results = max_tokens if pytorch_backend else max_tokens + 1
    if enable_chunked_prefill:
        expected_num_results += 1
    assert len(results) == expected_num_results
    # Filter out the results where no requests are scheduled
    results = [
        r for r in results
        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
    ]

    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    assert len(results) == context_iterations + generation_iterations

    microbatch_id = 0
    for iter, result in enumerate(results):
        ifbStats = result["inflightBatchingStats"]
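Note: the iteration-count assertion introduced above is easy to check by hand. The snippet below is not part of the patch; it only replays the arithmetic the test encodes (context iterations plus one iteration per remaining generated token), with max_tokens=6 chosen to match the value the harness uses further down.

# Illustration only, not repository code: replay of the expected-iteration arithmetic.
def expected_scheduled_iterations(max_tokens: int, enable_chunked_prefill: bool) -> int:
    # Per the test, chunked prefill splits the context pass into two scheduled chunks.
    context_iterations = 2 if enable_chunked_prefill else 1
    # The first token comes out of the context phase, leaving max_tokens - 1 generation steps.
    generation_iterations = max_tokens - 1
    return context_iterations + generation_iterations

assert expected_scheduled_iterations(max_tokens=6, enable_chunked_prefill=False) == 6
assert expected_scheduled_iterations(max_tokens=6, enable_chunked_prefill=True) == 7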
@@ -2085,12 +2088,6 @@ def validate_stats(
            assert ifbStats["numGenRequests"] == 1, f"iter: {iter}"
            assert result["numActiveRequests"] == 1, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
        else:
            assert ifbStats["numScheduledRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numContextRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numGenRequests"] == 0, f"iter: {iter}"
            assert result["numActiveRequests"] == 0, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"

        # In pipeline parallel mode, increment microbatch_id for each context iteration except the last one,
        # since the context chunks can be scheduled in each iteration.
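Note: the comment above states the bookkeeping rule the loop relies on. The helper below is purely an illustration of that rule as read from the comment; the function name and the pp_size guard are assumptions, not code from this patch.

# Illustration only: the microbatch_id update rule described by the comment above.
def advance_microbatch_id(microbatch_id: int, iteration: int,
                          context_iterations: int, pp_size: int) -> int:
    is_context_iteration = iteration < context_iterations
    is_last_context_iteration = iteration == context_iterations - 1
    if pp_size > 1 and is_context_iteration and not is_last_context_iteration:
        # Context chunks can be scheduled in consecutive iterations, each on its own microbatch.
        return microbatch_id + 1
    return microbatch_id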
@@ -2171,10 +2168,8 @@ def llm_get_stats_test_harness(tp_size: int = 1,
            disable_overlap_scheduler=not use_overlap))
        LLM_CLASS = LLM_torch
    else:
        LLM_CLASS = LLM

    if not pytorch_backend:
        llm_args_extra["fast_build"] = True
        LLM_CLASS = LLM

    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
@@ -2322,6 +2317,7 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   **llm_args_extra) as llm:

        max_tokens = 6
@@ -444,15 +444,26 @@ DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {},


@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_tp2(pytorch_backend):
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=False)


@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_async_tp2(pytorch_backend):
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_llm_get_stats_pp2(enable_chunked_prefill):
    llm_get_stats_test_harness(pp_size=2,
                               pytorch_backend=False,
                               enable_chunked_prefill=enable_chunked_prefill)


@skip_single_gpu
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=False)


@skip_single_gpu
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=False)


@skip_ray
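Note: skip_single_gpu and skip_ray are guards defined elsewhere in the test suite and are not shown in this diff. For readers outside the repo, a guard of this kind is typically just a conditional pytest marker; the sketch below is an assumed pattern for illustration, not the repository's actual implementation.

# Sketch only (assumed pattern, not repository code): a device-count guard for multi-GPU tests.
import pytest
import torch

skip_single_gpu = pytest.mark.skipif(torch.cuda.device_count() < 2,
                                     reason="requires at least 2 GPUs")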
@@ -12,6 +12,7 @@ from .lora_test_utils import (
    check_phi3_lora_fused_modules_output_tp2_identical_to_tp1,
    test_lora_with_and_without_cuda_graph)
from .test_llm import (_test_llm_capture_request_error, llama_model_path,
                       llm_get_stats_async_test_harness,
                       llm_get_stats_test_harness,
                       llm_return_logprobs_test_harness,
                       tinyllama_logits_processor_test_harness)
@@ -169,3 +170,21 @@ def test_llm_get_stats_pp4(return_context_logits, enable_chunked_prefill,
        enable_chunked_prefill=enable_chunked_prefill,
        enable_iter_req_stats=enable_iter_req_stats,
    )


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=True)


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=True)


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=True)