diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index da0093bc13..88b5dd0f00 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -84,11 +84,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): attn_backend=attn_backend, # https://nvbugspro.nvidia.com/bug/5345391 disable_overlap_scheduler=True) - llm = LLM(self.MODEL_PATH, - enable_chunked_prefill=True, - max_num_tokens=512, - **pytorch_config) - with llm: + with LLM(self.MODEL_PATH, + enable_chunked_prefill=True, + max_num_tokens=512, + **pytorch_config) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) @@ -107,8 +106,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, ) - llm = LLM(self.MODEL_PATH, **pytorch_config) - with llm: + with LLM(self.MODEL_PATH, **pytorch_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -134,11 +132,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, ) - llm = LLM(self.MODEL_PATH, - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - **pytorch_config) - with llm: + with LLM(self.MODEL_PATH, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + **pytorch_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -162,14 +159,13 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM( - f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", - quant_config=quant_config, - **pytorch_config) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: + with LLM( + f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", + quant_config=quant_config, + **pytorch_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -201,31 +197,30 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM( - f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - quant_config=quant_config, - **pytorch_config) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: + with LLM( + f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + quant_config=quant_config, + **pytorch_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @skip_pre_hopper def test_fp8_llm_sampler(self): model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8" - llm = LLM(model_path, enable_trtllm_sampler=True, max_batch_size=256) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + with LLM(model_path, enable_trtllm_sampler=True, + max_batch_size=256) as llm: + assert 
llm.args.quant_config.quant_algo == QuantAlgo.FP8 - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - ) + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) - with llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm, sampling_params=sampling_params, @@ -245,13 +240,11 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): spec_config = EagleDecodingConfig(max_draft_len=draft_len, speculative_model_dir=eagle_model_dir) - llm = LLM(model=target_model_dir, - **pytorch_config, - kv_cache_config=kv_cache_config, - speculative_config=spec_config, - build_config=None) - - with llm: + with LLM(model=target_model_dir, + **pytorch_config, + kv_cache_config=kv_cache_config, + speculative_config=spec_config, + build_config=None) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) @@ -269,12 +262,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): is_public_pool=True, ) - llm = LLM(model=self.MODEL_PATH, - **pytorch_config, - kv_cache_config=kv_cache_config, - speculative_config=spec_config) - - with llm: + with LLM(model=self.MODEL_PATH, + **pytorch_config, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) @@ -291,17 +282,17 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): task = JsonModeEval(self.MODEL_NAME) task.evaluate(llm) + @pytest.mark.timeout(7200) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"]) def test_guided_decoding_4gpus(self, backend: str, mocker): mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"}) - llm = LLM(self.MODEL_PATH, - guided_decoding_backend=backend, - disable_overlap_scheduler=True, - cuda_graph_config=CudaGraphConfig(), - tensor_parallel_size=2, - pipeline_parallel_size=2) - with llm: + with LLM(self.MODEL_PATH, + guided_decoding_backend=backend, + disable_overlap_scheduler=True, + cuda_graph_config=CudaGraphConfig(), + tensor_parallel_size=2, + pipeline_parallel_size=2) as llm: task = JsonModeEval(self.MODEL_NAME) task.evaluate(llm) @@ -591,12 +582,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(self.MODEL_PATH, - kv_cache_config=kv_cache_config, - **pytorch_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) - with llm: + with LLM(self.MODEL_PATH, + kv_cache_config=kv_cache_config, + **pytorch_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -634,15 +624,14 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(self.MODEL_PATH, - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) - with llm: + with LLM(self.MODEL_PATH, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -686,18 +675,17 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): mtp_config = 
MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn, use_mtp_vanilla=True) - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -742,20 +730,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM( - f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config, - ) + with LLM( + f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config, + ) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -773,12 +760,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): padding_enabled=True, ), ) - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - kv_cache_config=kv_cache_config, - **pytorch_config, - speculative_config=mtp_config) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - with llm: + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + kv_cache_config=kv_cache_config, + **pytorch_config, + speculative_config=mtp_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -799,15 +785,14 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - tensor_parallel_size=4, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - with llm: + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + tensor_parallel_size=4, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -856,21 +841,20 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - tensor_parallel_size=tp_size, - 
pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -926,23 +910,21 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM( - f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config, - ) + with LLM( + f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config, + ) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -966,13 +948,12 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(), moe_backend="WIDEEP", moe_load_balancer=eplb_config) - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", - tensor_parallel_size=4, - moe_expert_parallel_size=4, - kv_cache_config=kv_cache_config, - **pytorch_backend_options, - enable_attention_dp=True) - with llm: + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", + tensor_parallel_size=4, + moe_expert_parallel_size=4, + kv_cache_config=kv_cache_config, + **pytorch_backend_options, + enable_attention_dp=True) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -990,14 +971,13 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(self.MODEL_PATH, - tensor_parallel_size=4, - moe_expert_parallel_size=4, - kv_cache_config=kv_cache_config, - enable_attention_dp=True, - **pytorch_config, - speculative_config=mtp_config) - with llm: + with LLM(self.MODEL_PATH, + tensor_parallel_size=4, + moe_expert_parallel_size=4, + kv_cache_config=kv_cache_config, + enable_attention_dp=True, + **pytorch_config, + speculative_config=mtp_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1018,14 +998,13 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config.kv_cache_quant_algo = QuantAlgo.FP8 
pytorch_backend_options["kv_cache_dtype"] = "fp8" - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", - tensor_parallel_size=4, - moe_expert_parallel_size=4, - kv_cache_config=kv_cache_config, - **pytorch_backend_options, - enable_attention_dp=True, - quant_config=quant_config) - with llm: + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", + tensor_parallel_size=4, + moe_expert_parallel_size=4, + kv_cache_config=kv_cache_config, + **pytorch_backend_options, + enable_attention_dp=True, + quant_config=quant_config) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1066,18 +1045,16 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp", - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp", + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1130,21 +1107,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1195,22 +1170,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM(model_path, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(model_path, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + if quant_dtype == "fp8": + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + elif quant_dtype == "nvfp4": + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - if quant_dtype == "fp8": - assert llm.args.quant_config.quant_algo == 
QuantAlgo.FP8_BLOCK_SCALES - elif quant_dtype == "nvfp4": - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - - with llm: + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1258,24 +1230,23 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): quant_config.kv_cache_quant_algo = QuantAlgo.FP8 pytorch_config["kv_cache_dtype"] = "fp8" - llm = LLM(model_path, - kv_cache_config=kv_cache_config, - enable_chunked_prefill=True, - max_num_tokens=512, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=True, - speculative_config=mtp_config) + with LLM(model_path, + kv_cache_config=kv_cache_config, + enable_chunked_prefill=True, + max_num_tokens=512, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=True, + speculative_config=mtp_config) as llm: - if quant_dtype == "fp8": - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - elif quant_dtype == "nvfp4": - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + if quant_dtype == "fp8": + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + elif quant_dtype == "nvfp4": + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1370,23 +1341,22 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4", - max_batch_size=max_batch_size, - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) + with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4", + max_batch_size=max_batch_size, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: - assert llm.args.moe_backend == moe_backend - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - if fp8kv: - assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + assert llm.args.moe_backend == moe_backend + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) @@ -1421,21 +1391,20 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) - llm = LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1", - max_batch_size=max_batch_size, - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - kv_cache_config=kv_cache_config, - **pytorch_config, - quant_config=quant_config, - enable_attention_dp=attention_dp, - speculative_config=mtp_config) - assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES - if fp8kv: - assert 
llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 + with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1", + max_batch_size=max_batch_size, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + quant_config=quant_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + if fp8kv: + assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 - with llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) @@ -1646,13 +1615,12 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) @@ -1668,13 +1636,12 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) @@ -1696,13 +1663,12 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1717,14 +1683,12 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - llm = LLM( - f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM(f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1764,14 +1728,13 @@ class 
TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): moe_backend=moe_backend, ) - llm = LLM( - f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = GSM8K(self.MODEL_NAME) task.evaluate(llm) @@ -1790,13 +1753,12 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness): disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) - llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp) - with llm: + with LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) @@ -1819,15 +1781,14 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): cuda_graph_config=CudaGraphConfig() if cuda_graph else None) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) - llm = LLM( - f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp, - kv_cache_config=kv_cache_config) - with llm: + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) @@ -1849,15 +1810,14 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): moe_backend=moe_backend) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) - llm = LLM( - f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", - tensor_parallel_size=tp_size, - pipeline_parallel_size=pp_size, - moe_expert_parallel_size=ep_size, - **pytorch_config, - enable_attention_dp=attention_dp, - kv_cache_config=kv_cache_config) - with llm: + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt index 0c5cd7ca86..4a5b31f0a2 100644 --- a/tests/integration/test_lists/qa/examples_test_list.txt +++ b/tests/integration/test_lists/qa/examples_test_list.txt @@ -169,8 +169,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instr examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] 
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] @@ -194,8 +192,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] # Multimodal Executor Cpp E2E Tests -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt index 666d3fda99..91a71d1c23 100644 --- a/tests/integration/test_lists/qa/llm_sanity_test.txt +++ b/tests/integration/test_lists/qa/llm_sanity_test.txt @@ -1,157 +1,102 @@ -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased] -examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only] -examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] -examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_logits-draft_len_8-float16-bs1] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1] -examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] 
-examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8] -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8] -examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8] -examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] -examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] -examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] -examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu_vswa[gemma-3-1b-it-fp8-bfloat16-8] -examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] -examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp8] -examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_4] -examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] -examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-1.3b-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-370m-float16-enable_gemm_plugin] -examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] -examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] -examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] -examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] -examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] 
-examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] -# Multimodal Executor Cpp E2E Tests -examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] - -examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] -examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq] -examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-fp8] -examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-full_prec] -examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] -examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8] -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] -examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] -examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] 
-examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime] -test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] -test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] -test_e2e.py::test_llama_e2e[use_py_session--] -llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-codellama/CodeLlama-7b-Instruct-hf] # 5min -llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf] # 5min -test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--] -test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--] -test_e2e.py::test_mistral_e2e[use_py_session---] -test_e2e.py::test_openai_multi_chat_example -test_e2e.py::test_openai_consistent_chat - -# Accuracy test list -accuracy/test_cli_flow.py::TestStarcoder2_3B::test_auto_dtype -accuracy/test_cli_flow.py::TestMinitron4BBase::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8 -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq -accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_medusa_fp8_prequantized -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4 -accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4 -accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2 -accuracy/test_cli_flow.py::TestMistral7B::test_beam_search -accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel] -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 -accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] +accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4 +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] 
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype_gsm8k +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 +accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 +accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 +accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 +accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype - -# Pivot to Pytorch test cases. +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] +disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] +test_e2e.py::test_openai_consistent_chat +test_e2e.py::test_openai_multi_chat_example +test_e2e.py::test_ptp_quickstart +test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] +test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B] 
+test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] +test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1] +test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] +test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1] +test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] -test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1] -test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] +test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision] +test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] +test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] +test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] +test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] +test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] +test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] -test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] +test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] +test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] +test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] +test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B] +test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1] test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False] - -# PyTorch flow disaggregated tests -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] 
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] +test_e2e.py::test_trtllm_benchmark_serving diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 557b43d2b8..b7c365dbd5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -449,3 +449,9 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987) examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914) +test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375) +examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423) +examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423) +examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422) +examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424) +test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)
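
Below is a minimal, illustrative sketch (not part of the patch) of the construct-inside-`with` pattern the hunks above standardize on, replacing the previous two-step `llm = LLM(...)` / `with llm:` form. It assumes the public tensorrt_llm.LLM / SamplingParams API; the model path and prompt are hypothetical, and the `enable_chunked_prefill` / `max_num_tokens` keywords are the same ones used in the patched tests.

# Illustrative sketch only: construct the LLM inside the `with` statement so
# engine shutdown is tied to the block, even if evaluation/generation raises.
from tensorrt_llm import LLM, SamplingParams

def main() -> None:
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Hypothetical model path; the patched tests pass paths under llm_models_root().
    with LLM("meta-llama/Llama-3.1-8B-Instruct",
             enable_chunked_prefill=True,
             max_num_tokens=512) as llm:
        outputs = llm.generate(["What is 2 + 2?"], sampling_params)
        print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    main()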