# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM from tensorrt_llm.sampling_params import SamplingParams from ..conftest import llm_models_root from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness class TestLlama3_1_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B" def get_default_kwargs(self): return { 'skip_tokenizer_init': False, 'trust_remote_code': True, # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/7142): # AutoDeploy does not support cache reuse yet. 'kv_cache_config': { 'enable_block_reuse': False, }, 'max_batch_size': 512, # 131072 is the max seq len for the model 'max_seq_len': 8192, # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs. # Set it explicitly here to 8192 which is the default in build_config. 'max_num_tokens': 8192, 'skip_loading_weights': False, 'compile_backend': 'torch-opt', 'free_mem_ratio': 0.7, 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] } def get_default_sampling_params(self): eos_id = -1 beam_width = 1 return SamplingParams(end_id=eos_id, pad_id=eos_id, n=beam_width, use_beam_search=beam_width > 1) @pytest.mark.skip_less_device_memory(32000) def test_auto_dtype(self): kwargs = self.get_default_kwargs() sampling_params = self.get_default_sampling_params() with AutoDeployLLM(model=self.MODEL_PATH, tokenizer=self.MODEL_PATH, **kwargs) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) task.evaluate(llm, sampling_params=sampling_params)