Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None][infra] Enable test of chunked prefill with logit post processor (#6483)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
parent a60190836c
commit b9fe0fa7ec
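Context for the change: a logits post processor rewrites the model's output logits in place before sampling. The test harness in this diff installs such a processor (MyLogitsProcessor, defined elsewhere in the test file and not shown here) that biases every decoding step toward a single token id, so the generation collapses to "Z Z Z Z Z Z". A minimal stand-alone sketch of that idea in plain PyTorch; the function name, bias value, and fake logits are illustrative assumptions, not the repo's implementation:

import torch

def bias_token(logits: torch.Tensor, token_id: int, bias: float = 1000.0) -> torch.Tensor:
    """Push one vocabulary entry far above the rest so greedy sampling always picks it."""
    logits = logits.clone()
    logits[..., token_id] += bias
    return logits

# Tiny demonstration on fake logits for a 10-token vocabulary.
fake_logits = torch.randn(1, 10)           # [batch, vocab]
biased = bias_token(fake_logits, token_id=7)
assert biased.argmax(dim=-1).item() == 7   # greedy decoding now emits token 7 every step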
@@ -14,5 +14,5 @@
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
 | Slide Window Attention | Yes | Yes | Yes | Untested | No | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
-| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
+| Logits Post Processor | No | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | |
 | Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
@@ -1149,6 +1149,11 @@ def tinyllama_logits_processor_test_harness(backend=None, **llm_kwargs):
     sampling_params = SamplingParams(
         max_tokens=6, logits_processor=MyLogitsProcessor(biased_word_id))

+    prompts = ["A B C"]
+    if llm_kwargs.get('enable_chunked_prefill', None):
+        prompts[0] = prompts[0] * 256
+        llm_kwargs["max_num_tokens"] = 256
+
     llm_test_harness(
         llama_model_path,
         prompts, ["Z Z Z Z Z Z"],
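Why the harness inflates the prompt: chunked prefill splits a long prompt across several engine iterations, each consuming at most max_num_tokens prompt tokens. Repeating "A B C" 256 times while capping max_num_tokens at 256 makes the prompt much longer than one chunk, so the logits post processor is exercised on a genuinely chunked prefill. A rough back-of-the-envelope check; the per-repetition token count is an assumption, since the exact number depends on the tokenizer:

import math

approx_prompt_tokens = 256 * 3   # assume roughly 3 tokens per "A B C" repetition
max_num_tokens = 256             # per-iteration token budget set by the test

prefill_chunks = math.ceil(approx_prompt_tokens / max_num_tokens)
print(prefill_chunks)            # ~3, i.e. the prefill definitely spans multiple iterations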
@@ -37,8 +37,10 @@ from transformers import AutoModelForCausalLM


 @force_ampere
-def test_tinyllama_logits_processor():
-    tinyllama_logits_processor_test_harness(backend="pytorch")
+@pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+def test_tinyllama_logits_processor(enable_chunked_prefill):
+    tinyllama_logits_processor_test_harness(
+        backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)


 @pytest.mark.parametrize(
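For reference, the parametrization above makes pytest collect two cases from the single test function, typically reported as test_tinyllama_logits_processor[False] and test_tinyllama_logits_processor[True]. A self-contained sketch of the same pattern; the assert is only a stand-in for the real harness call:

import pytest

@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_pattern_example(enable_chunked_prefill):
    # Stand-in for tinyllama_logits_processor_test_harness(...); the real test
    # builds an LLM with or without chunked prefill and checks the "Z Z Z Z Z Z" output.
    assert isinstance(enable_chunked_prefill, bool)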