From b3e4ddc953ac3321fceffa34588f0f134164a6d6 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Mon, 9 Feb 2026 10:25:32 +0100
Subject: [PATCH] [None][test] Enhance multi-GPU tests for IFB stats (#11239)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm.py             | 22 ++++++++----------
 tests/unittest/llmapi/test_llm_multi_gpu.py   | 23 ++++++++++++++-----
 .../llmapi/test_llm_multi_gpu_pytorch.py      | 19 +++++++++++++++
 3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index b80f600ce3..cc120ab1a1 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -2062,13 +2062,16 @@ def validate_stats(
         ifbStats = result["inflightBatchingStats"]
         print(f"iter: {iter}, ifbStats: {ifbStats}")
 
-    expected_num_results = max_tokens if pytorch_backend else max_tokens + 1
-    if enable_chunked_prefill:
-        expected_num_results += 1
-    assert len(results) == expected_num_results
+    # Filter out the results where no requests are scheduled
+    results = [
+        r for r in results
+        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
+    ]
 
     context_iterations = 2 if enable_chunked_prefill else 1
     generation_iterations = max_tokens - 1
+    assert len(results) == context_iterations + generation_iterations
+
     microbatch_id = 0
     for iter, result in enumerate(results):
         ifbStats = result["inflightBatchingStats"]
@@ -2085,12 +2088,6 @@ def validate_stats(
             assert ifbStats["numGenRequests"] == 1, f"iter: {iter}"
             assert result["numActiveRequests"] == 1, f"iter: {iter}"
             assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
-        else:
-            assert ifbStats["numScheduledRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["numContextRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["numGenRequests"] == 0, f"iter: {iter}"
-            assert result["numActiveRequests"] == 0, f"iter: {iter}"
-            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
 
         # In pipeline parallel mode, increment microbatch_id for each context iteration except the last one,
         # since the context chunks can be scheduled in each iteration.
@@ -2171,10 +2168,8 @@ def llm_get_stats_test_harness(tp_size: int = 1,
                 disable_overlap_scheduler=not use_overlap))
         LLM_CLASS = LLM_torch
     else:
-        LLM_CLASS = LLM
-
-    if not pytorch_backend:
         llm_args_extra["fast_build"] = True
+        LLM_CLASS = LLM
 
     with LLM_CLASS(model=llama_model_path,
                    kv_cache_config=global_kvcache_config,
@@ -2322,6 +2317,7 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     with LLM_CLASS(model=llama_model_path,
                    kv_cache_config=global_kvcache_config,
                    tensor_parallel_size=tp_size,
+                   pipeline_parallel_size=pp_size,
                    **llm_args_extra) as llm:
 
         max_tokens = 6
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index 971f25f11e..bdcdb1c1d7 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -444,15 +444,26 @@ DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {},
 
 
 @skip_single_gpu
-@pytest.mark.parametrize("pytorch_backend", [False, True])
-def test_llm_get_stats_tp2(pytorch_backend):
-    llm_get_stats_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
+def test_llm_get_stats_tp2():
+    llm_get_stats_test_harness(tp_size=2, pytorch_backend=False)
 
 
 @skip_single_gpu
-@pytest.mark.parametrize("pytorch_backend", [False, True])
-def test_llm_get_stats_async_tp2(pytorch_backend):
-    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
+def test_llm_get_stats_pp2(enable_chunked_prefill):
+    llm_get_stats_test_harness(pp_size=2,
+                               pytorch_backend=False,
+                               enable_chunked_prefill=enable_chunked_prefill)
+
+
+@skip_single_gpu
+def test_llm_get_stats_async_tp2():
+    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=False)
+
+
+@skip_single_gpu
+def test_llm_get_stats_async_pp2():
+    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=False)
 
 
 @skip_ray
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index 2229de60e8..0e15b38b8b 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -12,6 +12,7 @@ from .lora_test_utils import (
     check_phi3_lora_fused_modules_output_tp2_identical_to_tp1,
     test_lora_with_and_without_cuda_graph)
 from .test_llm import (_test_llm_capture_request_error, llama_model_path,
+                       llm_get_stats_async_test_harness,
                        llm_get_stats_test_harness,
                        llm_return_logprobs_test_harness,
                        tinyllama_logits_processor_test_harness)
@@ -169,3 +170,21 @@ def test_llm_get_stats_pp4(return_context_logits, enable_chunked_prefill,
         enable_chunked_prefill=enable_chunked_prefill,
         enable_iter_req_stats=enable_iter_req_stats,
     )
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_tp2():
+    llm_get_stats_test_harness(tp_size=2, pytorch_backend=True)
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_async_tp2():
+    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=True)
+
+
+@skip_ray
+@pytest.mark.gpu2
+def test_llm_get_stats_async_pp2():
+    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=True)
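
For review purposes, below is a minimal, self-contained sketch of the iteration-count check that the updated `validate_stats` relies on: iterations where no requests were scheduled (e.g. pipeline-parallel iterations that only advance other microbatches) are dropped, and the remaining count is compared against `context_iterations + generation_iterations`. The sample stats list and helper names here are hypothetical; only the filter predicate and the expected-count formula mirror the patch.

```python
# Hypothetical illustration of the counting logic used by validate_stats in
# this patch. The per-iteration stats entries are fabricated; only the filter
# on numScheduledRequests and the expected-count formula follow the diff.


def expected_iterations(max_tokens: int, enable_chunked_prefill: bool) -> int:
    # One context (prefill) iteration, or two when chunked prefill splits the
    # prompt, plus one generation iteration per output token after the first.
    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    return context_iterations + generation_iterations


def scheduled_only(results: list[dict]) -> list[dict]:
    # Drop iterations where the engine reported no scheduled requests.
    return [
        r for r in results
        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
    ]


if __name__ == "__main__":
    max_tokens = 3
    fake_results = [  # fabricated per-iteration stats
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # context
        {"inflightBatchingStats": {"numScheduledRequests": 0}},  # idle (PP bubble)
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # generation
        {"inflightBatchingStats": {"numScheduledRequests": 1}},  # generation
    ]
    filtered = scheduled_only(fake_results)
    assert len(filtered) == expected_iterations(max_tokens,
                                                enable_chunked_prefill=False)
```

Filtering first and then asserting on the filtered length is what lets the same `validate_stats` serve both the TP and the new PP test variants, since PP runs interleave iterations in which a given microbatch schedules nothing.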