diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 594adcf168..bcdaba2dd4 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -524,6 +524,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
         "tests/unittest/_torch/multi_gpu/",
         "tests/unittest/_torch/multi_gpu_modeling/",
+        "tests/unittest/llmapi/test_llm_pytorch.py",
+        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
         "jenkins/L0_Test.groovy",
     ]
 
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 85fa1e8734..786e28d0b0 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1199,6 +1199,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A30-TensorRT-2": ["a30", "l0_a30", 2, 4],
         "A30-TensorRT-3": ["a30", "l0_a30", 3, 4],
         "A30-TensorRT-4": ["a30", "l0_a30", 4, 4],
+        "A100X-PyTorch-1": ["a100x", "l0_a100", 1, 1],
         "A100X-TensorRT-1": ["a100x", "l0_a100", 1, 4],
         "A100X-TensorRT-2": ["a100x", "l0_a100", 2, 4],
         "A100X-TensorRT-3": ["a100x", "l0_a100", 3, 4],
diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml
index f4e87b7771..d4f6cbbda9 100644
--- a/tests/integration/test_lists/test-db/l0_a100.yml
+++ b/tests/integration/test_lists/test-db/l0_a100.yml
@@ -1,5 +1,19 @@
 version: 0.0.1
 l0_a100:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: "pytorch"
+  tests:
+  - unittest/llmapi/test_llm_pytorch.py
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 0cfb98a021..cc5e313a9b 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -16,6 +16,7 @@ l0_dgx_h100:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
   - unittest/_torch/auto_deploy/unit/multigpu
+  - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM]
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index f74a6e0074..4ab35dffb6 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -1099,12 +1099,8 @@ def tinyllama_guided_decoding_test_harness(**llm_kwargs):
 
 @force_ampere
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
-    tinyllama_guided_decoding_test_harness(**llm_kwargs)
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness()
 
 
 @pytest.mark.part0
@@ -1766,18 +1762,13 @@ def llm_get_stats_test_harness(tp_size: int = 1,
     assert llm.get_stats(2)
 
 
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats(return_context_logits, pytorch_backend, use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats(return_context_logits):
     llm_get_stats_test_harness(tp_size=1,
-                               return_context_logits=return_context_logits,
-                               pytorch_backend=pytorch_backend,
-                               use_overlap=use_overlap)
+                               return_context_logits=return_context_logits)
 
 
 def llm_get_stats_async_test_harness(tp_size: int = 1,
@@ -1833,20 +1824,15 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     asyncio.run(main())
 
 
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats_async(return_context_logits, pytorch_backend,
-                             use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats_async(return_context_logits):
     llm_get_stats_async_test_harness(
         tp_size=1,
         return_context_logits=return_context_logits,
-        pytorch_backend=pytorch_backend,
-        use_overlap=use_overlap)
+    )
 
 
 def test_llm_chunked_prefill():
@@ -1986,10 +1972,9 @@ def run_llm_with_postprocess_parallel_and_result_handler(
 
 
 @pytest.mark.parametrize("streaming", [True, False])
-@pytest.mark.parametrize("backend", [None, "pytorch"])
-def test_llm_with_postprocess_parallel_and_result_handler(streaming, backend):
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
-                                                         backend,
+                                                         backend=None,
                                                          tp_size=1)
 
 
@@ -2041,41 +2026,6 @@ def test_llm_abort_request(llm_for_sampling_params,
                           sampling_params=sampling_params)
 
 
-@force_ampere
-@pytest.mark.parametrize(
-    "sampling_params",
-    [
-        SamplingParams()  # pytorch only supports n=1
-    ])
-def test_llm_abort_request_pytorch(sampling_params):
-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config)
-    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
-
-
-def test_llm_reward_model_pytorch():
-    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
-    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
-    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
-
-    from tensorrt_llm._torch import LLM as LLM_torch
-    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
-
-    sampling_params = SamplingParams(return_context_logits=True)
-
-    outputs = llm.generate(prompts, sampling_params)
-    scores = outputs[0].context_logits
-
-    print(scores)
-
-    assert scores.shape == (tokenized_input.shape[1], 2)
-    assert not outputs[0].outputs[0].text
-
-
 def test_llm_sampling_params_n_lt_max_batch_size():
     sampling_params = SamplingParams(n=2, best_of=1)
     build_config = BuildConfig(max_batch_size=1, max_seq_len=1024)
@@ -2117,7 +2067,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == '__main__':
-    test_llm_with_postprocess_parallel_and_result_handler(True, "pytorch")
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index e03672e7b3..39f398f359 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -234,16 +234,11 @@ def test_tinyllama_logits_processor_tp2pp2():
 
 @pytest.mark.gpu4
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding_tp2pp2(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
+def test_tinyllama_guided_decoding_tp2pp2():
     tinyllama_guided_decoding_test_harness(
         tensor_parallel_size=2,
         pipeline_parallel_size=2,
-        kv_cache_config=global_kv_cache_config,
-        **llm_kwargs)
+        kv_cache_config=global_kv_cache_config)
 
 
 @pytest.mark.gpu2
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
new file mode 100644
index 0000000000..9689b765f4
--- /dev/null
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -0,0 +1,15 @@
+import pytest
+
+# isort: off
+from .test_llm import (global_kvcache_config,
+                       tinyllama_guided_decoding_test_harness)
+# isort: on
+
+
+@pytest.mark.gpu4
+def test_tinyllama_guided_decoding_tp2pp2():
+    tinyllama_guided_decoding_test_harness(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+        kv_cache_config=global_kvcache_config,
+        backend='pytorch')
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
new file mode 100644
index 0000000000..20ee44a093
--- /dev/null
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -0,0 +1,84 @@
+import pytest
+
+from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
+from tensorrt_llm.sampling_params import SamplingParams
+
+# isort: off
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_guided_decoding_test_harness)
+from utils.util import force_ampere
+# isort: on
+
+
+@force_ampere
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness(backend="pytorch")
+
+
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats(return_context_logits, use_overlap):
+    llm_get_stats_test_harness(tp_size=1,
+                               return_context_logits=return_context_logits,
+                               pytorch_backend=True,
+                               use_overlap=use_overlap)
+
+
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats_async(return_context_logits, use_overlap):
+    llm_get_stats_async_test_harness(
+        tp_size=1,
+        return_context_logits=return_context_logits,
+        pytorch_backend=True,
+        use_overlap=use_overlap)
+
+
+@force_ampere
+@pytest.mark.parametrize(
+    "sampling_params",
+    [
+        SamplingParams()  # pytorch only supports n=1
+    ])
+def test_llm_abort_request(sampling_params):
+    from tensorrt_llm._torch import LLM as LLM_torch
+    llm = LLM_torch(model=llama_model_path,
+                    kv_cache_config=global_kvcache_config)
+    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
+
+
+def test_llm_reward_model():
+    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
+    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
+    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
+
+    from tensorrt_llm._torch import LLM as LLM_torch
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+    llm = LLM_torch(
+        model=rm_model_path,
+        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+
+    sampling_params = SamplingParams(return_context_logits=True)
+
+    outputs = llm.generate(prompts, sampling_params)
+    scores = outputs[0].context_logits
+
+    print(scores)
+
+    assert scores.shape == (tokenized_input.shape[1], 2)
+    assert not outputs[0].outputs[0].text
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
+    run_llm_with_postprocess_parallel_and_result_handler(streaming,
+                                                         "pytorch",
+                                                         tp_size=1)