From f93ac9672ef81d289bff0116da9f72c5b299000f Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Tue, 25 Mar 2025 21:55:08 +0800 Subject: [PATCH] clean (#3061) Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 17 +++---- .../defs/accuracy/test_accuracy.py | 45 +++++++------------ .../test_lists/qa/examples_test_list.txt | 6 +-- .../integration/test_lists/test-db/l0_a10.yml | 4 +- .../test_lists/test-db/l0_h100.yml | 6 +-- 5 files changed, 29 insertions(+), 49 deletions(-) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index 2b040472a5..8486fc004b 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -21,17 +21,6 @@ gpt2-medium: accuracy: 22.249 gpt-next: - accuracy: 25.516 -EleutherAI/gpt-j-6b: - - accuracy: 27.883 - - dtype: float32 - accuracy: 26.449 - - quant_algo: FP8 - kv_cache_quant_algo: FP8 - accuracy: 27.883 - - extra_acc_spec: max_attention_window_size=960 - accuracy: 27.822 - - extra_acc_spec: max_attention_window_size=960;beam_width=4 - accuracy: 0 microsoft/phi-2: - accuracy: 31.255 microsoft/Phi-3-mini-4k-instruct: @@ -77,6 +66,8 @@ meta-llama/Llama-2-7b-hf: accuracy: 27.999 TinyLlama/TinyLlama-1.1B-Chat-v1.0: - accuracy: 28.328 + - dtype: float32 + accuracy: 28.082 - quant_algo: W8A16 accuracy: 28.003 - quant_algo: W8A16 @@ -141,6 +132,10 @@ meta-llama/Llama-3.2-1B: - quant_algo: FP8_PER_CHANNEL_PER_TOKEN extra_acc_spec: meta_recipe accuracy: 27.614 + - extra_acc_spec: max_attention_window_size=960 + accuracy: 27.259 + - extra_acc_spec: max_attention_window_size=960;beam_width=4 + accuracy: 0 mistralai/Mixtral-8x7B-v0.1: - accuracy: 28.810 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_accuracy.py b/tests/integration/defs/accuracy/test_accuracy.py index 190fe98ac7..c0323ff321 100644 --- a/tests/integration/defs/accuracy/test_accuracy.py +++ b/tests/integration/defs/accuracy/test_accuracy.py @@ -174,36 +174,6 @@ class TestMinitron4BBase(AccuracyTestHarness): kv_cache_quant_algo=QuantAlgo.FP8) -class TestGptJ6B(AccuracyTestHarness): - MODEL_NAME = "EleutherAI/gpt-j-6b" - MODEL_PATH = f"{llm_models_root()}/gpt-j-6b" - EXAMPLE_FOLDER = "models/contrib/gptj" - - def test_auto_dtype(self): - # float16 - self.run(dtype='auto') - - def test_float32(self): - self.run(dtype='float32') - - @skip_pre_ada - def test_fp8(self): - self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) - - @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352") - def test_cyclic_kv_cache(self): - self.run(extra_acc_spec="max_attention_window_size=960", - extra_summarize_args=["--max_attention_window_size=960"]) - - @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352") - def test_cyclic_kv_cache_beam_search(self): - self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4", - extra_build_args=["--max_beam_width=4"], - extra_summarize_args=[ - "--max_attention_window_size=960", "--num_beams=4" - ]) - - class TestPhi2(AccuracyTestHarness): MODEL_NAME = "microsoft/phi-2" MODEL_PATH = f"{llm_models_root()}/phi-2" @@ -486,6 +456,9 @@ class TestTinyLlama1_1BChat(AccuracyTestHarness): def test_auto_dtype(self): self.run(dtype='auto') + def test_float32(self): + self.run(dtype='float32') + @pytest.mark.parametrize("precision", ["int8", "int4"]) def test_weight_only(self, precision: str): quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16 @@ -768,6 +741,18 @@ class TestLlama3_2_1B(AccuracyTestHarness): self.extra_summarize_args = [f"--gpu_weights_percent={gpu_percent}"] self.evaluate() + def test_cyclic_kv_cache(self): + self.run(extra_acc_spec="max_attention_window_size=960", + extra_summarize_args=["--max_attention_window_size=960"]) + + @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352") + def test_cyclic_kv_cache_beam_search(self): + self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4", + extra_build_args=["--max_beam_width=4"], + extra_summarize_args=[ + "--max_attention_window_size=960", "--num_beams=4" + ]) + class TestMixtral8x7B(AccuracyTestHarness): MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1" diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt index 8e20c8bd6b..36c3c7a907 100644 --- a/tests/integration/test_lists/qa/examples_test_list.txt +++ b/tests/integration/test_lists/qa/examples_test_list.txt @@ -349,9 +349,6 @@ accuracy/test_accuracy.py::TestStarcoder2_15B::test_smooth_quant_ootb accuracy/test_accuracy.py::TestGptNext::test_auto_dtype accuracy/test_accuracy.py::TestMinitron4BBase::test_auto_dtype accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8 -accuracy/test_accuracy.py::TestGptJ6B::test_float32 -accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache -accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search accuracy/test_accuracy.py::TestPhi2::test_auto_dtype accuracy/test_accuracy.py::TestPhi2::test_tp2 accuracy/test_accuracy.py::TestPhi3Mini4kInstruct::test_auto_dtype @@ -388,6 +385,7 @@ accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_tp2 accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_pre_quantized_tp2 accuracy/test_accuracy.py::TestLlama2_7B::test_int4_gptq_pre_quantized_tp2 accuracy/test_accuracy.py::TestLlama2_7B::test_weight_sparsity +accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32 accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int8] accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int4] accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] @@ -422,6 +420,8 @@ accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8 accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_pp2 accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_rowwise accuracy/test_accuracy.py::TestLlama3_2_1B::test_weight_streaming[1.0] +accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache +accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2 accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights accuracy/test_accuracy.py::TestMixtral8x7B::test_nvfp4_pre_quantized diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index c8c2d21922..841127f486 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -112,9 +112,9 @@ l0_a10: - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin] - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] # 4 mins - accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min - - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache - - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search - accuracy/test_accuracy.py::TestLlama7B::test_auto_dtype # 2 mins + - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache + - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search - accuracy/test_accuracy.py::TestGpt2::test_attention_ootb - accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype - accuracy/test_accuracy.py::TestPhi2::test_auto_dtype # 2 mins diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index d131f82b49..1e5d37835a 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -123,9 +123,6 @@ l0_h100: - accuracy/test_accuracy.py::TestGpt2::test_cuda_graph # 1 min - accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min - accuracy/test_accuracy.py::TestGptNext::test_auto_dtype # 1.5 mins - - accuracy/test_accuracy.py::TestGptJ6B::test_float32 # 4 mins - - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache - - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search - accuracy/test_accuracy.py::TestSantacoder::test_auto_dtype # 1.5 mins - accuracy/test_accuracy.py::TestMamba130M::test_auto_dtype # 1 min - accuracy/test_accuracy.py::TestVicuna7B::test_lookahead # 5 mins @@ -139,11 +136,14 @@ l0_h100: - accuracy/test_accuracy.py::TestGpt2::test_attention_ootb - accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype - accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8 + - accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32 - accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context - accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl - accuracy/test_accuracy.py::TestLlama3_1_8B::test_fp8 - accuracy/test_accuracy.py::TestLlama3_1_8B::test_autoq - accuracy/test_accuracy.py::TestLlama3_2_1B::test_auto_dtype + - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache + - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search - accuracy/test_accuracy.py::TestGemma2_9BIt::test_auto_dtype - examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]