From f93ac9672ef81d289bff0116da9f72c5b299000f Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Tue, 25 Mar 2025 21:55:08 +0800
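Subject: [PATCH] clean up GPT-J accuracy tests; move cases to smaller models (#3061)

Remove the TestGptJ6B accuracy tests and the EleutherAI/gpt-j-6b entries
in the cnn_dailymail references. Re-home the float32 case on
TestTinyLlama1_1BChat and the cyclic KV cache cases on TestLlama3_2_1B,
adding matching reference entries. test_cyclic_kv_cache now runs;
test_cyclic_kv_cache_beam_search stays skipped
(https://nvbugspro.nvidia.com/bug/5166352). The GPT-J auto-dtype and FP8
cases are dropped. Update the QA, l0_a10 and l0_h100 test lists to match.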

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Co-authored-by: QI JUN <22017000+QiJune@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    | 17 +++----
 .../defs/accuracy/test_accuracy.py            | 45 +++++++------------
 .../test_lists/qa/examples_test_list.txt      |  6 +--
 .../integration/test_lists/test-db/l0_a10.yml |  4 +-
 .../test_lists/test-db/l0_h100.yml            |  6 +--
 5 files changed, 29 insertions(+), 49 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 2b040472a5..8486fc004b 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -21,17 +21,6 @@ gpt2-medium:
     accuracy: 22.249
 gpt-next:
   - accuracy: 25.516
-EleutherAI/gpt-j-6b:
-  - accuracy: 27.883
-  - dtype: float32
-    accuracy: 26.449
-  - quant_algo: FP8
-    kv_cache_quant_algo: FP8
-    accuracy: 27.883
-  - extra_acc_spec: max_attention_window_size=960
-    accuracy: 27.822
-  - extra_acc_spec: max_attention_window_size=960;beam_width=4
-    accuracy: 0
 microsoft/phi-2:
   - accuracy: 31.255
 microsoft/Phi-3-mini-4k-instruct:
@@ -77,6 +66,8 @@ meta-llama/Llama-2-7b-hf:
     accuracy: 27.999
 TinyLlama/TinyLlama-1.1B-Chat-v1.0:
   - accuracy: 28.328
+  - dtype: float32
+    accuracy: 28.082
   - quant_algo: W8A16
     accuracy: 28.003
   - quant_algo: W8A16
@@ -141,6 +132,10 @@ meta-llama/Llama-3.2-1B:
   - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
     extra_acc_spec: meta_recipe
     accuracy: 27.614
+  - extra_acc_spec: max_attention_window_size=960
+    accuracy: 27.259
+  - extra_acc_spec: max_attention_window_size=960;beam_width=4
+    accuracy: 0
 mistralai/Mixtral-8x7B-v0.1:
   - accuracy: 28.810
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_accuracy.py b/tests/integration/defs/accuracy/test_accuracy.py
index 190fe98ac7..c0323ff321 100644
--- a/tests/integration/defs/accuracy/test_accuracy.py
+++ b/tests/integration/defs/accuracy/test_accuracy.py
@@ -174,36 +174,6 @@ class TestMinitron4BBase(AccuracyTestHarness):
                  kv_cache_quant_algo=QuantAlgo.FP8)
 
 
-class TestGptJ6B(AccuracyTestHarness):
-    MODEL_NAME = "EleutherAI/gpt-j-6b"
-    MODEL_PATH = f"{llm_models_root()}/gpt-j-6b"
-    EXAMPLE_FOLDER = "models/contrib/gptj"
-
-    def test_auto_dtype(self):
-        # float16
-        self.run(dtype='auto')
-
-    def test_float32(self):
-        self.run(dtype='float32')
-
-    @skip_pre_ada
-    def test_fp8(self):
-        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
-
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
-    def test_cyclic_kv_cache(self):
-        self.run(extra_acc_spec="max_attention_window_size=960",
-                 extra_summarize_args=["--max_attention_window_size=960"])
-
-    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
-    def test_cyclic_kv_cache_beam_search(self):
-        self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
-                 extra_build_args=["--max_beam_width=4"],
-                 extra_summarize_args=[
-                     "--max_attention_window_size=960", "--num_beams=4"
-                 ])
-
-
 class TestPhi2(AccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
@@ -486,6 +456,9 @@ class TestTinyLlama1_1BChat(AccuracyTestHarness):
     def test_auto_dtype(self):
         self.run(dtype='auto')
 
+    def test_float32(self):
+        self.run(dtype='float32')
+
     @pytest.mark.parametrize("precision", ["int8", "int4"])
     def test_weight_only(self, precision: str):
         quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
@@ -768,6 +741,18 @@ class TestLlama3_2_1B(AccuracyTestHarness):
             self.extra_summarize_args = [f"--gpu_weights_percent={gpu_percent}"]
             self.evaluate()
 
+    def test_cyclic_kv_cache(self):
+        self.run(extra_acc_spec="max_attention_window_size=960",
+                 extra_summarize_args=["--max_attention_window_size=960"])
+
+    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
+    def test_cyclic_kv_cache_beam_search(self):
+        self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
+                 extra_build_args=["--max_beam_width=4"],
+                 extra_summarize_args=[
+                     "--max_attention_window_size=960", "--num_beams=4"
+                 ])
+
 
 class TestMixtral8x7B(AccuracyTestHarness):
     MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 8e20c8bd6b..36c3c7a907 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -349,9 +349,6 @@ accuracy/test_accuracy.py::TestStarcoder2_15B::test_smooth_quant_ootb
 accuracy/test_accuracy.py::TestGptNext::test_auto_dtype
 accuracy/test_accuracy.py::TestMinitron4BBase::test_auto_dtype
 accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
-accuracy/test_accuracy.py::TestGptJ6B::test_float32
-accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
-accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
 accuracy/test_accuracy.py::TestPhi2::test_auto_dtype
 accuracy/test_accuracy.py::TestPhi2::test_tp2
 accuracy/test_accuracy.py::TestPhi3Mini4kInstruct::test_auto_dtype
@@ -388,6 +385,7 @@ accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_tp2
 accuracy/test_accuracy.py::TestLlama2_7B::test_int4_awq_pre_quantized_tp2
 accuracy/test_accuracy.py::TestLlama2_7B::test_int4_gptq_pre_quantized_tp2
 accuracy/test_accuracy.py::TestLlama2_7B::test_weight_sparsity
+accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
 accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int8]
 accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only[int4]
 accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8]
@@ -422,6 +420,8 @@ accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8
 accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_pp2
 accuracy/test_accuracy.py::TestLlama3_2_1B::test_fp8_rowwise
 accuracy/test_accuracy.py::TestLlama3_2_1B::test_weight_streaming[1.0]
+accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
+accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
 accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2
 accuracy/test_accuracy.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
 accuracy/test_accuracy.py::TestMixtral8x7B::test_nvfp4_pre_quantized
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index c8c2d21922..841127f486 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -112,9 +112,9 @@ l0_a10:
   - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin]
   - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] # 4 mins
   - accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
-  - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
-  - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
   - accuracy/test_accuracy.py::TestLlama7B::test_auto_dtype # 2 mins
+  - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
+  - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
   - accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
   - accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
   - accuracy/test_accuracy.py::TestPhi2::test_auto_dtype # 2 mins
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index d131f82b49..1e5d37835a 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -123,9 +123,6 @@ l0_h100:
   - accuracy/test_accuracy.py::TestGpt2::test_cuda_graph # 1 min
   - accuracy/test_accuracy.py::TestGpt2::test_context_fmha_disabled # 1 min
   - accuracy/test_accuracy.py::TestGptNext::test_auto_dtype # 1.5 mins
-  - accuracy/test_accuracy.py::TestGptJ6B::test_float32 # 4 mins
-  - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache
-  - accuracy/test_accuracy.py::TestGptJ6B::test_cyclic_kv_cache_beam_search
   - accuracy/test_accuracy.py::TestSantacoder::test_auto_dtype # 1.5 mins
   - accuracy/test_accuracy.py::TestMamba130M::test_auto_dtype # 1 min
   - accuracy/test_accuracy.py::TestVicuna7B::test_lookahead # 5 mins
@@ -139,11 +136,14 @@ l0_h100:
   - accuracy/test_accuracy.py::TestGpt2::test_attention_ootb
   - accuracy/test_accuracy.py::TestStarcoder2_3B::test_auto_dtype
   - accuracy/test_accuracy.py::TestMinitron4BBase::test_fp8
+  - accuracy/test_accuracy.py::TestTinyLlama1_1BChat::test_float32
   - accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context
   - accuracy/test_accuracy.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl
   - accuracy/test_accuracy.py::TestLlama3_1_8B::test_fp8
   - accuracy/test_accuracy.py::TestLlama3_1_8B::test_autoq
   - accuracy/test_accuracy.py::TestLlama3_2_1B::test_auto_dtype
+  - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache
+  - accuracy/test_accuracy.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search
   - accuracy/test_accuracy.py::TestGemma2_9BIt::test_auto_dtype
   - examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b]
   - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]