From b3e4ddc953ac3321fceffa34588f0f134164a6d6 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Mon, 9 Feb 2026 10:25:32 +0100
Subject: [PATCH] [None][test] Enhance multi-GPU tests for IFB stats (#11239)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm.py             | 22 ++++++++----------
 tests/unittest/llmapi/test_llm_multi_gpu.py   | 23 ++++++++++++++-----
 .../llmapi/test_llm_multi_gpu_pytorch.py      | 19 +++++++++++++++
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index b80f600ce3..cc120ab1a1 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -2062,13 +2062,16 @@ def validate_stats(
         ifbStats = result["inflightBatchingStats"]
         print(f"iter: {iter}, ifbStats: {ifbStats}")
 
-    expected_num_results = max_tokens if pytorch_backend else max_tokens + 1
-    if enable_chunked_prefill:
-        expected_num_results += 1
-    assert len(results) == expected_num_results
+    # Filter out the results where no requests are scheduled
+    results = [
+        r for r in results
+        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
+    ]
 
     context_iterations = 2 if enable_chunked_prefill else 1
     generation_iterations = max_tokens - 1
+    assert len(results) == context_iterations + generation_iterations
+
     microbatch_id = 0
     for iter, result in enumerate(results):
         ifbStats = result["inflightBatchingStats"]
@@ -2085,12 +2088,6 @@ def validate_stats(
             assert ifbStats["numGenRequests"] == 1, f"iter: {iter}"
             assert result["numActiveRequests"] == 1, f"iter: {iter}"
             assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
-        else:
-            assert ifbStats["numScheduledRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["numContextRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["numGenRequests"] == 0, f"iter: {iter}"
-            assert result["numActiveRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
 
         # In pipeline parallel mode, increment microbatch_id for each context iteration except the last one,
         # since the context chunks can be scheduled in each iteration.
@@ -2171,10 +2168,8 @@ def llm_get_stats_test_harness(tp_size: int = 1,
                 disable_overlap_scheduler=not use_overlap))
         LLM_CLASS = LLM_torch
     else:
-        LLM_CLASS = LLM
-
-    if not pytorch_backend:
         llm_args_extra["fast_build"] = True
+        LLM_CLASS = LLM
 
     with LLM_CLASS(model=llama_model_path,
                    kv_cache_config=global_kvcache_config,
@@ -2322,6 +2317,7 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     with LLM_CLASS(model=llama_model_path,
                    kv_cache_config=global_kvcache_config,
                    tensor_parallel_size=tp_size,
+                   pipeline_parallel_size=pp_size,
                    **llm_args_extra) as llm:
 
         max_tokens = 6
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index 971f25f11e..bdcdb1c1d7 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -444,15 +444,26 @@ DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {},
 
 
 @skip_single_gpu
-@pytest.mark.parametrize("pytorch_backend", [False, True])
-def test_llm_get_stats_tp2(pytorch_backend):
-    llm_get_stats_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
+def test_llm_get_stats_tp2():
+    llm_get_stats_test_harness(tp_size=2, pytorch_backend=False)
 
 
 @skip_single_gpu
-@pytest.mark.parametrize("pytorch_backend", [False, True])
-def test_llm_get_stats_async_tp2(pytorch_backend):
-    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
+def test_llm_get_stats_pp2(enable_chunked_prefill):
+    llm_get_stats_test_harness(pp_size=2,
+                               pytorch_backend=False,
+                               enable_chunked_prefill=enable_chunked_prefill)
+
+
+@skip_single_gpu
+def test_llm_get_stats_async_tp2():
+    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=False)
+
+
+@skip_single_gpu
+def test_llm_get_stats_async_pp2():
+    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=False)
 
 
 @skip_ray
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index 2229de60e8..0e15b38b8b 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -12,6 +12,7 @@ from .lora_test_utils import (
     check_phi3_lora_fused_modules_output_tp2_identical_to_tp1,
     test_lora_with_and_without_cuda_graph)
 from .test_llm import (_test_llm_capture_request_error, llama_model_path,
+                       llm_get_stats_async_test_harness,
                        llm_get_stats_test_harness,
                        llm_return_logprobs_test_harness,
                        tinyllama_logits_processor_test_harness)
@@ -169,3 +170,21 @@ def test_llm_get_stats_pp4(return_context_logits, enable_chunked_prefill,
         enable_chunked_prefill=enable_chunked_prefill,
         enable_iter_req_stats=enable_iter_req_stats,
     )
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_tp2():
+    llm_get_stats_test_harness(tp_size=2, pytorch_backend=True)
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_async_tp2():
+    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=True)
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_async_pp2():
+    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=True)
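
For review purposes, below is a minimal, self-contained sketch of the iteration-count check that the updated `validate_stats` relies on: iterations where no requests were scheduled (e.g. pipeline-parallel iterations that only advance other microbatches) are dropped, and the remaining count is compared against `context_iterations + generation_iterations`. The sample stats list and helper names here are hypothetical; only the filter predicate and the expected-count formula mirror the patch.

```python
# Hypothetical illustration of the counting logic used by validate_stats in
# this patch. The per-iteration stats entries are fabricated; only the filter
# on numScheduledRequests and the expected-count formula follow the diff.


def expected_iterations(max_tokens: int, enable_chunked_prefill: bool) -> int:
    # One context (prefill) iteration, or two when chunked prefill splits the
    # prompt, plus one generation iteration per output token after the first.
    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    return context_iterations + generation_iterations


def scheduled_only(results: list[dict]) -> list[dict]:
    # Drop iterations where the engine reported no scheduled requests.
    return [
        r for r in results
        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
    ]


if __name__ == "__main__":
    max_tokens = 3
    fake_results = [  # fabricated per-iteration stats
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # context
        {"inflightBatchingStats": {"numScheduledRequests": 0}},  # idle (PP bubble)
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # generation
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # generation
    ]
    filtered = scheduled_only(fake_results)
    assert len(filtered) == expected_iterations(max_tokens,
                                                enable_chunked_prefill=False)
```

Filtering first and then asserting on the filtered length is what lets the same `validate_stats` serve both the TP and the new PP test variants, since PP runs interleave iterations in which a given microbatch schedules nothing.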