fix: Fix warmup phase batch size out of range. (#4986)

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
Yukun He 2025-06-09 19:19:16 +08:00 committed by GitHub
parent 88480197da
commit 137fe35539
4 changed files with 8 additions and 9 deletions

3rdparty/cutlass vendored

@@ -1 +1 @@
-Subproject commit 8206e7a0f57a9a057cdd2c3bb4899bd5154a82e1
+Subproject commit afa1772203677c5118fcd82537a9c8fefbcc7008


@@ -584,16 +584,17 @@ class PyTorchModelEngine(ModelEngine):
available_blocks = kv_cache_manager.get_num_free_blocks()
maximum_tunable_num_tokens = min(
self.batch_size * num_tokens_per_request, self.max_num_tokens,
available_blocks * kv_cache_manager.tokens_per_block)
# Calculate number of full-length requests and remaining tokens
# Each request has num_tokens_per_request tokens, except possibly the last one
-full_len_request_num = self.max_num_tokens // num_tokens_per_request
-remaining_tokens = self.max_num_tokens % num_tokens_per_request
+full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
+remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1
if self.max_num_tokens > available_blocks * kv_cache_manager.tokens_per_block:
return None, None
requests = kv_cache_manager.add_dummy_requests(
request_ids=list(range(full_len_request_num)),
token_nums=[num_tokens_per_request] * full_len_request_num,
@@ -617,7 +617,7 @@ class PyTorchModelEngine(ModelEngine):
result.context_requests = requests
result.generation_requests = []
-return result, _create_extra_inputs(1, self.max_num_tokens)
+return result, _create_extra_inputs(1, maximum_tunable_num_tokens)
@contextlib.contextmanager
def release_batch(result):

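For context, a minimal sketch of the computation this hunk changes, using a hypothetical helper name and made-up numbers (neither is from the repository): the warmup token budget is now clamped to the free KV-cache capacity before being split into dummy requests, so the derived request count can no longer exceed what the cache can hold.

# Illustrative sketch only, not part of this commit; names mirror the diff.
def plan_warmup_requests(batch_size, max_num_tokens, num_tokens_per_request,
                         available_blocks, tokens_per_block):
    # Clamp the tunable token budget to the free KV-cache capacity.
    maximum_tunable_num_tokens = min(batch_size * num_tokens_per_request,
                                     max_num_tokens,
                                     available_blocks * tokens_per_block)
    # Ceil-divide the budget into full-length requests plus an optional shorter tail.
    full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
    remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
    request_num = full_len_request_num + (1 if remaining_tokens else 0)
    return maximum_tunable_num_tokens, request_num

# With max_num_tokens=8192 but only 16 free blocks * 64 tokens/block = 1024
# cacheable tokens, the old code derived 8192 // 1024 = 8 dummy requests
# (out of range); the clamped budget yields (1024, 1).
print(plan_warmup_requests(batch_size=8, max_num_tokens=8192,
                           num_tokens_per_request=1024,
                           available_blocks=16, tokens_per_block=64))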

@@ -428,7 +428,6 @@ test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-
unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)
-test_e2e.py::test_openai_reasoning SKIP (https://nvbugs/5310329)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-summarization_long] SKIP (https://nvbugs/5324976)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354)


@@ -1930,7 +1930,6 @@ def test_llm_get_stats(return_context_logits, enable_iter_req_stats):
def test_llm_get_queued_stats():
pytest.skip("https://nvbugspro.nvidia.com/bug/5325642")
enable_iter_req_stats = True
use_overlap = False
tp_size = 1