Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None][infra] Enable test of chunked prefill with logit post processor (#6483)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
parent a60190836c
commit b9fe0fa7ec
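Context for the change: a logits post processor rewrites the model's output logits in place before sampling. The test harness in this diff installs such a processor (MyLogitsProcessor, defined elsewhere in the test file and not shown here) that biases every decoding step toward a single token id, so the generation collapses to "Z Z Z Z Z Z". A minimal stand-alone sketch of that idea in plain PyTorch; the function name, bias value, and fake logits are illustrative assumptions, not the repo's implementation:

import torch

def bias_token(logits: torch.Tensor, token_id: int, bias: float = 1000.0) -> torch.Tensor:
    """Push one vocabulary entry far above the rest so greedy sampling always picks it."""
    logits = logits.clone()
    logits[..., token_id] += bias
    return logits

# Tiny demonstration on fake logits for a 10-token vocabulary.
fake_logits = torch.randn(1, 10)           # [batch, vocab]
biased = bias_token(fake_logits, token_id=7)
assert biased.argmax(dim=-1).item() == 7   # greedy decoding now emits token 7 every step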
@@ -14,5 +14,5 @@
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
 | Slide Window Attention | Yes | Yes | Yes | Untested | No | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
-| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
+| Logits Post Processor | No | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | |
 | Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
@@ -1149,6 +1149,11 @@ def tinyllama_logits_processor_test_harness(backend=None, **llm_kwargs):
     sampling_params = SamplingParams(
         max_tokens=6, logits_processor=MyLogitsProcessor(biased_word_id))

+    prompts = ["A B C"]
+    if llm_kwargs.get('enable_chunked_prefill', None):
+        prompts[0] = prompts[0] * 256
+        llm_kwargs["max_num_tokens"] = 256
+
     llm_test_harness(
         llama_model_path,
         prompts, ["Z Z Z Z Z Z"],
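Why the harness inflates the prompt: chunked prefill splits a long prompt across several engine iterations, each consuming at most max_num_tokens prompt tokens. Repeating "A B C" 256 times while capping max_num_tokens at 256 makes the prompt much longer than one chunk, so the logits post processor is exercised on a genuinely chunked prefill. A rough back-of-the-envelope check; the per-repetition token count is an assumption, since the exact number depends on the tokenizer:

import math

approx_prompt_tokens = 256 * 3   # assume roughly 3 tokens per "A B C" repetition
max_num_tokens = 256             # per-iteration token budget set by the test

prefill_chunks = math.ceil(approx_prompt_tokens / max_num_tokens)
print(prefill_chunks)            # ~3, i.e. the prefill definitely spans multiple iterations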
@@ -37,8 +37,10 @@ from transformers import AutoModelForCausalLM


 @force_ampere
-def test_tinyllama_logits_processor():
-    tinyllama_logits_processor_test_harness(backend="pytorch")
+@pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+def test_tinyllama_logits_processor(enable_chunked_prefill):
+    tinyllama_logits_processor_test_harness(
+        backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)


 @pytest.mark.parametrize(
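For reference, the parametrization above makes pytest collect two cases from the single test function, typically reported as test_tinyllama_logits_processor[False] and test_tinyllama_logits_processor[True]. A self-contained sketch of the same pattern; the assert is only a stand-in for the real harness call:

import pytest

@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_pattern_example(enable_chunked_prefill):
    # Stand-in for tinyllama_logits_processor_test_harness(...); the real test
    # builds an LLM with or without chunked prefill and checks the "Z Z Z Z Z Z" output.
    assert isinstance(enable_chunked_prefill, bool)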