[None][infra] Enable test of chunked prefill with logit post processor (#6483)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Leslie Fang authored on 2025-08-04 13:46:07 +08:00, committed by GitHub
parent a60190836c
commit b9fe0fa7ec
3 changed files with 10 additions and 3 deletions

@@ -14,5 +14,5 @@
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
 | Slide Window Attention | Yes | Yes | Yes | Untested | No | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
-| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
+| Logits Post Processor | No | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | |
 | Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
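
The matrix row updated above flips the chunked-prefill column for the logits post processor from "Untested" to "Yes". For context, a logits post processor is an object the request passes via `SamplingParams(logits_processor=...)` that rewrites the next-token logits before sampling. Below is a minimal, hypothetical sketch of such a processor that forces a single token id, mirroring what the harness's `MyLogitsProcessor` appears to do (the test expects "Z Z Z Z Z Z"); the exact interface tensorrt_llm expects may differ.

```python
import torch


class BiasToTokenLogitsProcessor:
    """Hypothetical logits post processor that forces one token id.

    A minimal sketch only: it assumes a callable-style interface that
    receives the next-token logits and returns biased logits. The actual
    signature expected by tensorrt_llm may differ.
    """

    def __init__(self, biased_word_id: int):
        self.biased_word_id = biased_word_id

    def __call__(self, logits: torch.Tensor) -> torch.Tensor:
        # Mask every vocabulary entry except the biased token, so greedy
        # decoding always emits it (e.g. "Z Z Z Z Z Z" in the harness).
        biased = torch.full_like(logits, float("-inf"))
        biased[..., self.biased_word_id] = 0.0
        return biased
```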

@@ -1149,6 +1149,11 @@ def tinyllama_logits_processor_test_harness(backend=None, **llm_kwargs):
     sampling_params = SamplingParams(
         max_tokens=6, logits_processor=MyLogitsProcessor(biased_word_id))
     prompts = ["A B C"]
+    if llm_kwargs.get('enable_chunked_prefill', None):
+        prompts[0] = prompts[0] * 256
+        llm_kwargs["max_num_tokens"] = 256
     llm_test_harness(
         llama_model_path,
         prompts, ["Z Z Z Z Z Z"],
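
The new branch only takes effect when the caller enables chunked prefill: the prompt "A B C" is repeated 256 times while `max_num_tokens` is capped at 256, so the prompt no longer fits into a single prefill step and the runtime must split prefill into several chunks before the logits post processor acts on the first generated token. A rough back-of-the-envelope illustration (the token counts below are assumed, not measured):

```python
import math

# Assumed numbers for illustration only: suppose each copy of "A B C "
# tokenizes to roughly 4 tokens. Repeating the prompt 256 times then
# exceeds the max_num_tokens budget of 256, forcing chunked prefill.
tokens_per_copy = 4
prompt_len = tokens_per_copy * 256      # ~1024 prompt tokens
max_num_tokens = 256                    # per-iteration token budget

num_prefill_chunks = math.ceil(prompt_len / max_num_tokens)
print(num_prefill_chunks)               # -> 4 with these assumed numbers
```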

@@ -37,8 +37,10 @@ from transformers import AutoModelForCausalLM
 @force_ampere
-def test_tinyllama_logits_processor():
-    tinyllama_logits_processor_test_harness(backend="pytorch")
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
+def test_tinyllama_logits_processor(enable_chunked_prefill):
+    tinyllama_logits_processor_test_harness(
+        backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
 @pytest.mark.parametrize(
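
For readers less familiar with pytest, the decorator added above makes pytest collect the test twice, once per value, passing the value in as the `enable_chunked_prefill` argument. A self-contained toy example (hypothetical test name, unrelated to the real suite):

```python
import pytest


@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_example(enable_chunked_prefill):
    # pytest generates two test cases, test_example[False] and
    # test_example[True], each receiving one of the values.
    assert enable_chunked_prefill in (False, True)
```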