Move PyTorch tests of the LLM API into separate test files (#3745)

* move pytorch tests of LLM API into separate test files

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* polish

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* update

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* clean

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
QI JUN, 2025-04-22 14:36:59 -07:00, committed by GitHub
parent b16a127026
commit 257abfbc51
8 changed files with 135 additions and 77 deletions
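
For local verification, the relocated PyTorch suites can be run straight through pytest's Python entry point. The sketch below is not part of the diff; it assumes execution from the repository root with the test dependencies installed, and it only reuses the file paths and the gpu4 marker that appear in the changes that follow.

    import pytest

    # Single-GPU PyTorch LLM API tests (new file added by this commit).
    pytest.main(["tests/unittest/llmapi/test_llm_pytorch.py", "-v"])

    # Multi-GPU variant, limited to the 4-GPU cases, mirroring the l0_dgx_h100 entry below.
    pytest.main(["tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py", "-m", "gpu4", "-v"])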


@@ -524,6 +524,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
         "tests/unittest/_torch/multi_gpu/",
         "tests/unittest/_torch/multi_gpu_modeling/",
+        "tests/unittest/llmapi/test_llm_pytorch.py",
+        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
         "jenkins/L0_Test.groovy",
     ]

jenkins/L0_Test.groovy

@@ -1199,6 +1199,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A30-TensorRT-2": ["a30", "l0_a30", 2, 4],
         "A30-TensorRT-3": ["a30", "l0_a30", 3, 4],
         "A30-TensorRT-4": ["a30", "l0_a30", 4, 4],
+        "A100X-PyTorch-1": ["a100x", "l0_a100", 1, 1],
         "A100X-TensorRT-1": ["a100x", "l0_a100", 1, 4],
         "A100X-TensorRT-2": ["a100x", "l0_a100", 2, 4],
         "A100X-TensorRT-3": ["a100x", "l0_a100", 3, 4],

tests/integration/test_lists/test-db/l0_a100.yml

@@ -1,5 +1,19 @@
 version: 0.0.1
 l0_a100:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: "pytorch"
+  tests:
+  - unittest/llmapi/test_llm_pytorch.py
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db/l0_dgx_h100.yml

@@ -16,6 +16,7 @@ l0_dgx_h100:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
   - unittest/_torch/auto_deploy/unit/multigpu
+  - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM]

tests/unittest/llmapi/test_llm.py

@@ -1099,12 +1099,8 @@ def tinyllama_guided_decoding_test_harness(**llm_kwargs):
 @force_ampere
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
-    tinyllama_guided_decoding_test_harness(**llm_kwargs)
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness()
 @pytest.mark.part0
@@ -1766,18 +1762,13 @@ def llm_get_stats_test_harness(tp_size: int = 1,
     assert llm.get_stats(2)
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats(return_context_logits, pytorch_backend, use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats(return_context_logits):
     llm_get_stats_test_harness(tp_size=1,
-                               return_context_logits=return_context_logits,
-                               pytorch_backend=pytorch_backend,
-                               use_overlap=use_overlap)
+                               return_context_logits=return_context_logits)
 def llm_get_stats_async_test_harness(tp_size: int = 1,
@@ -1833,20 +1824,15 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     asyncio.run(main())
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats_async(return_context_logits, pytorch_backend,
-                             use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats_async(return_context_logits):
     llm_get_stats_async_test_harness(
         tp_size=1,
         return_context_logits=return_context_logits,
-        pytorch_backend=pytorch_backend,
-        use_overlap=use_overlap)
+    )
 def test_llm_chunked_prefill():
@@ -1986,10 +1972,9 @@ def run_llm_with_postprocess_parallel_and_result_handler(
 @pytest.mark.parametrize("streaming", [True, False])
-@pytest.mark.parametrize("backend", [None, "pytorch"])
-def test_llm_with_postprocess_parallel_and_result_handler(streaming, backend):
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
-                                                          backend,
+                                                          backend=None,
                                                           tp_size=1)
@@ -2041,41 +2026,6 @@ def test_llm_abort_request(llm_for_sampling_params,
                           sampling_params=sampling_params)
-@force_ampere
-@pytest.mark.parametrize(
-    "sampling_params",
-    [
-        SamplingParams()  # pytorch only supports n=1
-    ])
-def test_llm_abort_request_pytorch(sampling_params):
-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config)
-    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
-def test_llm_reward_model_pytorch():
-    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
-    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
-    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
-    from tensorrt_llm._torch import LLM as LLM_torch
-    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
-    sampling_params = SamplingParams(return_context_logits=True)
-    outputs = llm.generate(prompts, sampling_params)
-    scores = outputs[0].context_logits
-    print(scores)
-    assert scores.shape == (tokenized_input.shape[1], 2)
-    assert not outputs[0].outputs[0].text
 def test_llm_sampling_params_n_lt_max_batch_size():
     sampling_params = SamplingParams(n=2, best_of=1)
     build_config = BuildConfig(max_batch_size=1, max_seq_len=1024)
@@ -2117,7 +2067,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == '__main__':
-    test_llm_with_postprocess_parallel_and_result_handler(True, "pytorch")

tests/unittest/llmapi/test_llm_multi_gpu.py

@@ -234,16 +234,11 @@ def test_tinyllama_logits_processor_tp2pp2():
 @pytest.mark.gpu4
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding_tp2pp2(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
+def test_tinyllama_guided_decoding_tp2pp2():
     tinyllama_guided_decoding_test_harness(
         tensor_parallel_size=2,
         pipeline_parallel_size=2,
-        kv_cache_config=global_kv_cache_config,
-        **llm_kwargs)
+        kv_cache_config=global_kv_cache_config)
 @pytest.mark.gpu2

tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py (new file)

@@ -0,0 +1,15 @@
+import pytest
+# isort: off
+from .test_llm import (global_kvcache_config,
+                       tinyllama_guided_decoding_test_harness)
+# isort: on
+@pytest.mark.gpu4
+def test_tinyllama_guided_decoding_tp2pp2():
+    tinyllama_guided_decoding_test_harness(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+        kv_cache_config=global_kvcache_config,
+        backend='pytorch')

tests/unittest/llmapi/test_llm_pytorch.py (new file)

@@ -0,0 +1,84 @@
+import pytest
+from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
+from tensorrt_llm.sampling_params import SamplingParams
+# isort: off
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_guided_decoding_test_harness)
+from utils.util import force_ampere
+# isort: on
+@force_ampere
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness(backend="pytorch")
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats(return_context_logits, use_overlap):
+    llm_get_stats_test_harness(tp_size=1,
+                               return_context_logits=return_context_logits,
+                               pytorch_backend=True,
+                               use_overlap=use_overlap)
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats_async(return_context_logits, use_overlap):
+    llm_get_stats_async_test_harness(
+        tp_size=1,
+        return_context_logits=return_context_logits,
+        pytorch_backend=True,
+        use_overlap=use_overlap)
+@force_ampere
+@pytest.mark.parametrize(
+    "sampling_params",
+    [
+        SamplingParams()  # pytorch only supports n=1
+    ])
+def test_llm_abort_request(sampling_params):
+    from tensorrt_llm._torch import LLM as LLM_torch
+    llm = LLM_torch(model=llama_model_path,
+                    kv_cache_config=global_kvcache_config)
+    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
+def test_llm_reward_model():
+    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
+    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
+    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
+    from tensorrt_llm._torch import LLM as LLM_torch
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+    llm = LLM_torch(
+        model=rm_model_path,
+        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+    sampling_params = SamplingParams(return_context_logits=True)
+    outputs = llm.generate(prompts, sampling_params)
+    scores = outputs[0].context_logits
+    print(scores)
+    assert scores.shape == (tokenized_input.shape[1], 2)
+    assert not outputs[0].outputs[0].text
+@pytest.mark.parametrize("streaming", [True, False])
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
+    run_llm_with_postprocess_parallel_and_result_handler(streaming,
+                                                         "pytorch",
+                                                         tp_size=1)
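
The gpu4 marker used by the new multi-GPU file (and referenced by the l0_dgx_h100 test-db entry above) is a custom pytest marker; such markers are normally declared in the project's existing pytest configuration. The snippet below is only an illustrative sketch of one common way to register them, not code from this commit.

    # conftest.py -- illustrative only; marker names taken from the tests above.
    def pytest_configure(config):
        # Register GPU-count markers so `-m "gpu4"` selection runs without unknown-marker warnings.
        config.addinivalue_line("markers", "gpu2: tests that require 2 GPUs")
        config.addinivalue_line("markers", "gpu4: tests that require 4 GPUs")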