Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 15:55:08 +08:00)
[None][test] Enhance multi-GPU tests for IFB stats (#11239)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Parent: 31db399042
Commit: b3e4ddc953
@@ -2062,13 +2062,16 @@ def validate_stats(
        ifbStats = result["inflightBatchingStats"]
        print(f"iter: {iter}, ifbStats: {ifbStats}")

    expected_num_results = max_tokens if pytorch_backend else max_tokens + 1
    if enable_chunked_prefill:
        expected_num_results += 1
    assert len(results) == expected_num_results
    # Filter out the results where no requests are scheduled
    results = [
        r for r in results
        if r["inflightBatchingStats"]["numScheduledRequests"] > 0
    ]

    context_iterations = 2 if enable_chunked_prefill else 1
    generation_iterations = max_tokens - 1
    assert len(results) == context_iterations + generation_iterations

    microbatch_id = 0
    for iter, result in enumerate(results):
        ifbStats = result["inflightBatchingStats"]
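Note: the iteration-count assertion introduced above is easy to check by hand. The snippet below is not part of the patch; it only replays the arithmetic the test encodes (context iterations plus one iteration per remaining generated token), with max_tokens=6 chosen to match the value the harness uses further down.

# Illustration only, not repository code: replay of the expected-iteration arithmetic.
def expected_scheduled_iterations(max_tokens: int, enable_chunked_prefill: bool) -> int:
    # Per the test, chunked prefill splits the context pass into two scheduled chunks.
    context_iterations = 2 if enable_chunked_prefill else 1
    # The first token comes out of the context phase, leaving max_tokens - 1 generation steps.
    generation_iterations = max_tokens - 1
    return context_iterations + generation_iterations

assert expected_scheduled_iterations(max_tokens=6, enable_chunked_prefill=False) == 6
assert expected_scheduled_iterations(max_tokens=6, enable_chunked_prefill=True) == 7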
@@ -2085,12 +2088,6 @@ def validate_stats(
            assert ifbStats["numGenRequests"] == 1, f"iter: {iter}"
            assert result["numActiveRequests"] == 1, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"
        else:
            assert ifbStats["numScheduledRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numContextRequests"] == 0, f"iter: {iter}"
            assert ifbStats["numGenRequests"] == 0, f"iter: {iter}"
            assert result["numActiveRequests"] == 0, f"iter: {iter}"
            assert ifbStats["microBatchId"] == microbatch_id, f"iter: {iter}"

        # In pipeline parallel mode, increment microbatch_id for each context iteration except the last one,
        # since the context chunks can be scheduled in each iteration.
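Note: the comment above states the bookkeeping rule the loop relies on. The helper below is purely an illustration of that rule as read from the comment; the function name and the pp_size guard are assumptions, not code from this patch.

# Illustration only: the microbatch_id update rule described by the comment above.
def advance_microbatch_id(microbatch_id: int, iteration: int,
                          context_iterations: int, pp_size: int) -> int:
    is_context_iteration = iteration < context_iterations
    is_last_context_iteration = iteration == context_iterations - 1
    if pp_size > 1 and is_context_iteration and not is_last_context_iteration:
        # Context chunks can be scheduled in consecutive iterations, each on its own microbatch.
        return microbatch_id + 1
    return microbatch_id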
@@ -2171,10 +2168,8 @@ def llm_get_stats_test_harness(tp_size: int = 1,
            disable_overlap_scheduler=not use_overlap))
        LLM_CLASS = LLM_torch
    else:
        LLM_CLASS = LLM

    if not pytorch_backend:
        llm_args_extra["fast_build"] = True
        LLM_CLASS = LLM

    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
@@ -2322,6 +2317,7 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
    with LLM_CLASS(model=llama_model_path,
                   kv_cache_config=global_kvcache_config,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   **llm_args_extra) as llm:

        max_tokens = 6
@@ -444,15 +444,26 @@ DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {},


@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_tp2(pytorch_backend):
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=False)


@skip_single_gpu
@pytest.mark.parametrize("pytorch_backend", [False, True])
def test_llm_get_stats_async_tp2(pytorch_backend):
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_llm_get_stats_pp2(enable_chunked_prefill):
    llm_get_stats_test_harness(pp_size=2,
                               pytorch_backend=False,
                               enable_chunked_prefill=enable_chunked_prefill)


@skip_single_gpu
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=False)


@skip_single_gpu
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=False)


@skip_ray
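Note: skip_single_gpu and skip_ray are guards defined elsewhere in the test suite and are not shown in this diff. For readers outside the repo, a guard of this kind is typically just a conditional pytest marker; the sketch below is an assumed pattern for illustration, not the repository's actual implementation.

# Sketch only (assumed pattern, not repository code): a device-count guard for multi-GPU tests.
import pytest
import torch

skip_single_gpu = pytest.mark.skipif(torch.cuda.device_count() < 2,
                                     reason="requires at least 2 GPUs")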
@@ -12,6 +12,7 @@ from .lora_test_utils import (
    check_phi3_lora_fused_modules_output_tp2_identical_to_tp1,
    test_lora_with_and_without_cuda_graph)
from .test_llm import (_test_llm_capture_request_error, llama_model_path,
                       llm_get_stats_async_test_harness,
                       llm_get_stats_test_harness,
                       llm_return_logprobs_test_harness,
                       tinyllama_logits_processor_test_harness)
@@ -169,3 +170,21 @@ def test_llm_get_stats_pp4(return_context_logits, enable_chunked_prefill,
        enable_chunked_prefill=enable_chunked_prefill,
        enable_iter_req_stats=enable_iter_req_stats,
    )


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_tp2():
    llm_get_stats_test_harness(tp_size=2, pytorch_backend=True)


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_tp2():
    llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=True)


@skip_ray
@pytest.mark.gpu2
def test_llm_get_stats_async_pp2():
    llm_get_stats_async_test_harness(pp_size=2, pytorch_backend=True)