mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
move pytorch tests of LLM API into separate test files (#3745)
* move pytorch tests of LLM API into separate test files
* polish
* fix ci
* fix ci
* fix ci
* update
* clean
* fix

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
parent
b16a127026
commit
257abfbc51
@@ -524,6 +524,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
         "tests/unittest/_torch/multi_gpu/",
         "tests/unittest/_torch/multi_gpu_modeling/",
+        "tests/unittest/llmapi/test_llm_pytorch.py",
+        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
         "jenkins/L0_Test.groovy",
     ]
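For context, this list is what getMultiGpuFileChanged matches changed file paths against, so edits to the two new test files now trigger the multi-GPU stages. The sketch below is a hypothetical Python analogue of that path check, not the actual Groovy implementation in jenkins/L0_Test.groovy.

# Hypothetical Python analogue of the multi-GPU trigger check; the real logic
# lives in jenkins/L0_Test.groovy and may differ in detail.
MULTI_GPU_PATHS = [
    "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
    "tests/unittest/_torch/multi_gpu/",
    "tests/unittest/_torch/multi_gpu_modeling/",
    "tests/unittest/llmapi/test_llm_pytorch.py",
    "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
    "jenkins/L0_Test.groovy",
]

def touches_multi_gpu(changed_files):
    # A change triggers the multi-GPU stages if any changed path starts with
    # one of the registered file or directory prefixes.
    return any(f.startswith(prefix)
               for f in changed_files
               for prefix in MULTI_GPU_PATHS)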
@@ -1199,6 +1199,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A30-TensorRT-2": ["a30", "l0_a30", 2, 4],
         "A30-TensorRT-3": ["a30", "l0_a30", 3, 4],
         "A30-TensorRT-4": ["a30", "l0_a30", 4, 4],
+        "A100X-PyTorch-1": ["a100x", "l0_a100", 1, 1],
         "A100X-TensorRT-1": ["a100x", "l0_a100", 1, 4],
         "A100X-TensorRT-2": ["a100x", "l0_a100", 2, 4],
         "A100X-TensorRT-3": ["a100x", "l0_a100", 3, 4],
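Judging from the surrounding entries, the four-element stage configs appear to encode [GPU type, test-db list, shard index, shard count]; that reading is an assumption inferred from this diff, not something it states. A minimal Python sketch under that assumption:

# Assumed field meaning: (gpu, test_db_list, shard, total_shards).
# Illustration only; this is not the Groovy code from launchTestJobs.
stages = {
    "A100X-PyTorch-1": ("a100x", "l0_a100", 1, 1),
    "A100X-TensorRT-1": ("a100x", "l0_a100", 1, 4),
}

for name, (gpu, test_list, shard, total) in stages.items():
    print(f"{name}: run split {shard}/{total} of {test_list} on {gpu}")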
@@ -1,5 +1,19 @@
 version: 0.0.1
 l0_a100:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: "pytorch"
+  tests:
+  - unittest/llmapi/test_llm_pytorch.py
 - condition:
     ranges:
       system_gpu_count:
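The new l0_a100 entry gates unittest/llmapi/test_llm_pytorch.py to single-A100 pre-merge runs with the pytorch backend. As a rough illustration only (not the actual TensorRT-LLM test-db matcher), such a condition could be evaluated roughly like this:

from fnmatch import fnmatch

# Hypothetical evaluator for one test-db condition entry; the real matcher
# lives elsewhere in the TensorRT-LLM test infrastructure.
condition = {
    "ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
    "wildcards": {"gpu": ["*a100*"], "linux_distribution_name": "ubuntu*"},
}

def matches(system_gpu_count, gpu_name, distro):
    rng = condition["ranges"]["system_gpu_count"]
    in_range = rng["gte"] <= system_gpu_count <= rng["lte"]
    gpu_ok = any(fnmatch(gpu_name.lower(), pat)
                 for pat in condition["wildcards"]["gpu"])
    distro_ok = fnmatch(distro, condition["wildcards"]["linux_distribution_name"])
    return in_range and gpu_ok and distro_ok

print(matches(1, "NVIDIA A100-SXM4-80GB", "ubuntu22.04"))  # expected: True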
@@ -16,6 +16,7 @@ l0_dgx_h100:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
   - unittest/_torch/auto_deploy/unit/multigpu
+  - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM]
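To reproduce the new l0_dgx_h100 entry locally, the same marker expression can be used to select the gpu4-marked test. A hedged run sketch, assuming a machine with at least 4 GPUs and the repository's unittest requirements installed:

import pytest

# Local equivalent of the CI entry above (illustrative only).
pytest.main(["-m", "gpu4", "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py"])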
@@ -1099,12 +1099,8 @@ def tinyllama_guided_decoding_test_harness(**llm_kwargs):

 @force_ampere
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
-    tinyllama_guided_decoding_test_harness(**llm_kwargs)
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness()


 @pytest.mark.part0
@@ -1766,18 +1762,13 @@ def llm_get_stats_test_harness(tp_size: int = 1,
     assert llm.get_stats(2)


-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats(return_context_logits, pytorch_backend, use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats(return_context_logits):
     llm_get_stats_test_harness(tp_size=1,
-                               return_context_logits=return_context_logits,
-                               pytorch_backend=pytorch_backend,
-                               use_overlap=use_overlap)
+                               return_context_logits=return_context_logits)


 def llm_get_stats_async_test_harness(tp_size: int = 1,
@@ -1833,20 +1824,15 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     asyncio.run(main())


-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats_async(return_context_logits, pytorch_backend,
-                             use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats_async(return_context_logits):
     llm_get_stats_async_test_harness(
         tp_size=1,
         return_context_logits=return_context_logits,
-        pytorch_backend=pytorch_backend,
-        use_overlap=use_overlap)
+    )


 def test_llm_chunked_prefill():
@@ -1986,10 +1972,9 @@ def run_llm_with_postprocess_parallel_and_result_handler(


 @pytest.mark.parametrize("streaming", [True, False])
-@pytest.mark.parametrize("backend", [None, "pytorch"])
-def test_llm_with_postprocess_parallel_and_result_handler(streaming, backend):
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
-                                                         backend,
+                                                         backend=None,
                                                          tp_size=1)

@@ -2041,41 +2026,6 @@ def test_llm_abort_request(llm_for_sampling_params,
                           sampling_params=sampling_params)


-@force_ampere
-@pytest.mark.parametrize(
-    "sampling_params",
-    [
-        SamplingParams()  # pytorch only supports n=1
-    ])
-def test_llm_abort_request_pytorch(sampling_params):
-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config)
-    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
-
-
-def test_llm_reward_model_pytorch():
-    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
-    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
-    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
-
-    from tensorrt_llm._torch import LLM as LLM_torch
-    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
-
-    sampling_params = SamplingParams(return_context_logits=True)
-
-    outputs = llm.generate(prompts, sampling_params)
-    scores = outputs[0].context_logits
-
-    print(scores)
-
-    assert scores.shape == (tokenized_input.shape[1], 2)
-    assert not outputs[0].outputs[0].text
-
-
 def test_llm_sampling_params_n_lt_max_batch_size():
     sampling_params = SamplingParams(n=2, best_of=1)
     build_config = BuildConfig(max_batch_size=1, max_seq_len=1024)
@@ -2117,7 +2067,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == '__main__':
-    test_llm_with_postprocess_parallel_and_result_handler(True, "pytorch")
@@ -234,16 +234,11 @@ def test_tinyllama_logits_processor_tp2pp2():

 @pytest.mark.gpu4
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding_tp2pp2(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
+def test_tinyllama_guided_decoding_tp2pp2():
     tinyllama_guided_decoding_test_harness(
         tensor_parallel_size=2,
         pipeline_parallel_size=2,
-        kv_cache_config=global_kv_cache_config,
-        **llm_kwargs)
+        kv_cache_config=global_kv_cache_config)


 @pytest.mark.gpu2
tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import pytest
+
+# isort: off
+from .test_llm import (global_kvcache_config,
+                       tinyllama_guided_decoding_test_harness)
+# isort: on
+
+
+@pytest.mark.gpu4
+def test_tinyllama_guided_decoding_tp2pp2():
+    tinyllama_guided_decoding_test_harness(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+        kv_cache_config=global_kvcache_config,
+        backend='pytorch')
tests/unittest/llmapi/test_llm_pytorch.py (new file, 84 lines)
@@ -0,0 +1,84 @@
+import pytest
+
+from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
+from tensorrt_llm.sampling_params import SamplingParams
+
+# isort: off
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_guided_decoding_test_harness)
+from utils.util import force_ampere
+# isort: on
+
+
+@force_ampere
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness(backend="pytorch")
+
+
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats(return_context_logits, use_overlap):
+    llm_get_stats_test_harness(tp_size=1,
+                               return_context_logits=return_context_logits,
+                               pytorch_backend=True,
+                               use_overlap=use_overlap)
+
+
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats_async(return_context_logits, use_overlap):
+    llm_get_stats_async_test_harness(
+        tp_size=1,
+        return_context_logits=return_context_logits,
+        pytorch_backend=True,
+        use_overlap=use_overlap)
+
+
+@force_ampere
+@pytest.mark.parametrize(
+    "sampling_params",
+    [
+        SamplingParams()  # pytorch only supports n=1
+    ])
+def test_llm_abort_request(sampling_params):
+    from tensorrt_llm._torch import LLM as LLM_torch
+    llm = LLM_torch(model=llama_model_path,
+                    kv_cache_config=global_kvcache_config)
+    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
+
+
+def test_llm_reward_model():
+    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
+    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
+    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
+
+    from tensorrt_llm._torch import LLM as LLM_torch
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+    llm = LLM_torch(
+        model=rm_model_path,
+        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+
+    sampling_params = SamplingParams(return_context_logits=True)
+
+    outputs = llm.generate(prompts, sampling_params)
+    scores = outputs[0].context_logits
+
+    print(scores)
+
+    assert scores.shape == (tokenized_input.shape[1], 2)
+    assert not outputs[0].outputs[0].text
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
+    run_llm_with_postprocess_parallel_and_result_handler(streaming,
+                                                         "pytorch",
+                                                         tp_size=1)
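Note that the new file pulls its harnesses in via `from .test_llm import ...`, so it is meant to be collected as part of the llmapi test package rather than executed as a standalone script. A hedged local-run sketch, assuming a single Ampere-or-newer GPU, the repository's unittest requirements installed, and pytest able to resolve the llmapi package as it does in CI:

import pytest

# Quick local smoke run of the new single-GPU PyTorch-backend tests
# (illustrative only); -q keeps the output terse.
pytest.main(["-q", "tests/unittest/llmapi/test_llm_pytorch.py"])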