[None][test] Enhance multi-GPU tests for IFB stats (#11239)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Robin Kobus 2026-02-09 10:25:32 +01:00 committed by GitHub
parent 31db399042
commit b3e4ddc953
3 changed files with 45 additions and 19 deletions

View File

@@ -2062,13 +2062,16 @@ def validate_stats(
        ifbStats = result["inflightBatchingStats"]
        print(f"iter: {iter}, ifbStats: {ifbStats}")

    expected_num_results = max_tokens if pytorch_backend else max_tokens + 1
    if enable_chunked_prefill:
        expected_num_results += 1
    assert len(results) == expected_num_results
    # Filter out the results where no requests are scheduled
    results = [
        r for r in results
        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
    ]
    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    assert len(results) == context_iterations + generation_iterations
    microbatch_id = 0
    for iter, result in enumerate(results):
        ifbStats = result["inflightBatchingStats"]
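As a concrete check of the count assertion above: the harness later in this diff sets max_tokens = 6, so the filtered results should hold 1 + 5 = 6 entries without chunked prefill and 2 + 5 = 7 entries with it. A small worked sketch (illustration only, not part of the diff):

max_tokens = 6  # value set in the test harness below
for enable_chunked_prefill in (False, True):
    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    # Prints 6 for enable_chunked_prefill=False and 7 for True.
    print(enable_chunked_prefill, context_iterations + generation_iterations)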
@@ -2085,12 +2088,6 @@ def validate_stats(
            assert ifbStats["numGenRequests"] == 1, f"iter: {iter}"
            assert result["numActiveRequests"] == 1, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
        else:
            assert ifbStats["numScheduledRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numContextRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numGenRequests"] == 0, f"iter: {iter}"
            assert result["numActiveRequests"] == 0, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
        # In pipeline parallel mode, increment microbatch_id for each context iteration except the last one,
        # since the context chunks can be scheduled in each iteration.
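A minimal sketch of the bookkeeping this comment describes, assuming the loop variables used in this function (iter, context_iterations, microbatch_id) and a pp_size parameter; this is an illustration, not the code from the diff:

# Hypothetical: in pipeline-parallel runs each context chunk can be scheduled in
# its own iteration, so every context iteration except the last moves on to the
# next microbatch before the generation iterations begin.
if pp_size > 1 and iter < context_iterations - 1:
    microbatch_id += 1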
@@ -2171,10 +2168,8 @@ def llm_get_stats_test_harness(tp_size: int = 1,
            disable_overlap_scheduler=not use_overlap))
        LLM_CLASS = LLM_torch
    else:
        LLM_CLASS = LLM
    if not pytorch_backend:
        llm_args_extra["fast_build"] = True
        LLM_CLASS = LLM

    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
@@ -2322,6 +2317,7 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   **llm_args_extra) as llm:
        max_tokens = 6
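The async harness now threads pipeline_parallel_size through to the LLM as well. A rough sketch of how per-iteration stats could be collected and filtered in the same spirit, assuming an llm.get_stats(timeout) call that returns a list of per-iteration dicts (an assumption; the retrieval code itself is not shown in this hunk):

# Hypothetical usage: gather per-iteration stats after generation and keep only
# iterations where the scheduler actually placed requests, mirroring the
# filtering added in validate_stats above.
results = llm.get_stats(2)
scheduled = [
    r for r in results
    if r["inflightBatchingStats"]["numScheduledRequests"] > 0
]
for i, r in enumerate(scheduled):
    print(i, r["inflightBatchingStats"]["microBatchId"])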

View File

@@ -444,15 +444,26 @@ DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {},
@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_tp2(pytorch_backend):
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=False)

@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_async_tp2(pytorch_backend):
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)

@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_llm_get_stats_pp2(enable_chunked_prefill):
    llm_get_stats_test_harness(pp_size=2,
                               pytorch_backend=False,
                               enable_chunked_prefill=enable_chunked_prefill)

@skip_single_gpu
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=False)

@skip_single_gpu
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=False)

@skip_ray
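The tp2/pp2 tests in this file are guarded by the skip_single_gpu marker (and some by skip_ray). A minimal sketch of what a skip_single_gpu-style marker typically looks like; the actual definition lives elsewhere in the test suite and may differ:

import pytest
import torch

# Hypothetical definition: skip multi-GPU tests when fewer than 2 GPUs are visible.
skip_single_gpu = pytest.mark.skipif(torch.cuda.device_count() < 2,
                                     reason="requires at least 2 GPUs")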

View File

@@ -12,6 +12,7 @@ from .lora_test_utils import (
    check_phi3_lora_fused_modules_output_tp2_identical_to_tp1,
    test_lora_with_and_without_cuda_graph)
from .test_llm import (_test_llm_capture_request_error, llama_model_path,
                       llm_get_stats_async_test_harness,
                       llm_get_stats_test_harness,
                       llm_return_logprobs_test_harness,
                       tinyllama_logits_processor_test_harness)
@@ -169,3 +170,21 @@ def test_llm_get_stats_pp4(return_context_logits, enable_chunked_prefill,
        enable_chunked_prefill=enable_chunked_prefill,
        enable_iter_req_stats=enable_iter_req_stats,
    )

@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=True)

@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=True)

@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=True)