diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index f847104866..649e826207 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1770,29 +1770,28 @@ class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(80000)
     def test_auto_dtype_tp2(self):
-        with LLM(self.MODEL_PATH, tensor_parallel_size=2) as llm:
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=2,
+                 max_seq_len=8192,
+                 max_batch_size=64) as llm:
+            # Run only one eval as maximal BS is not large
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @skip_pre_hopper
     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_device_not_contain(["H100", "B200"])
     def test_fp8_prequantized_tp2(self):
         model_path = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
-        with LLM(model_path, tensor_parallel_size=2) as llm:
+        with LLM(model_path,
+                 tensor_parallel_size=2,
+                 max_seq_len=8192,
+                 max_batch_size=64) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+
+            # Run only one eval as maximal BS is not large
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))
 
 
 class TestLlama3_1NemotronNano8Bv1(LlmapiAccuracyTestHarness):