diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 82891ca847..f6d81460fe 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -111,6 +111,7 @@ MODEL_PATH_DICT = {
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
     "qwen2_7b_instruct": "Qwen2-7B-Instruct",
     "qwen_14b_chat": "Qwen-14B-Chat",
+    "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
     "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
index b8f8b1f222..2db0b307b2 100644
--- a/tests/integration/test_lists/qa/llm_perf_core.yml
+++ b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -94,7 +94,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200]
   - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200]
   #llama_v3.1_8b
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
   #mixtral_8x7b_v0.1
diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml
index 069bd02ea2..0348cef095 100644
--- a/tests/integration/test_lists/qa/llm_perf_sanity.yml
+++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml
@@ -1,6 +1,18 @@
 version: 0.0.1
 llm_perf_sanity:
-# A100, L40S, L20, H20, H100, H200, Blackwell
+# ===============================================================================
+# Test Conditions Index
+# ===============================================================================
+# 1: All GPUs
+# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
+# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
+# 6: H20, H100, H200, B200, B300, RTX6000-Server
+# 7: H20, H100, H200, B200, B300
+# ===============================================================================
+
+# 1: All GPUs
 - condition:
     ranges:
       system_gpu_count:
@@ -28,10 +40,13 @@ llm_perf_sanity:
   - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
   - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
   - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]


-# FP8 specific tests
-# A100, L40S, L20, H20, H100, H200, Blackwell
+# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
 - condition:
     terms:
       supports_fp8: true
@@ -45,94 +60,40 @@ tests:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
   - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250]
   - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250]
-
-# Tests for ALL systems with 2+ GPUs
-# A100, L40S, L20, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 2
-  tests:
-  #llama_v3.1_8b_instruct
-  #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2]
-  - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
-
-# FP8 tests for systems with 2+ GPUs
-# A100, L40S, L20, H20, H100, H200, Blackwell
-- condition:
-    terms:
-      supports_fp8: true
-    ranges:
-      system_gpu_count:
-        gte: 2
-
-  tests:
-  #mixtral_8x7b_v0.1_fp8 pytorch backend
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

-# Tests for systems with 2+ GPUs and high memory
-# A100, L40S, H20, H100, H200, Blackwell
+
+# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     ranges:
-      system_gpu_count:
-        gte: 2
       gpu_memory:
         gt: 80000
-
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2]
-
-# Tests for systems with 4+ GPUs
-# A100, L40S, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 4
-
-  tests:
-  #llama_v3.1_70b
-  #trt backend
-  #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
-
-# FP8 specific tests
-# L40S, H20, H100, H200, Blackwell
-- condition:
-    terms:
-      supports_fp8: true
-    ranges:
-      system_gpu_count:
-        gte: 4
-
-  tests:
-  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
-  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
-
-# Tests for systems with 8+ GPUs
-# A100, L40S, H20, H100, H200, Blackwell
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 8
-      gpu_memory:
-        gt: 46000
-
-  tests:
-  #llama_v3.1_70b
-  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]

-# FP8 tests for systems with 8+ GPUs
-# L40S, H20, H100, H200, Blackwell
+# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+
+  tests:
+  # llama_v3.1_70b
+  # trt backend
+  # pytorch backend
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+  # test overlap scheduler
+  - perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200]
+
+
+# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     terms:
       supports_fp8: true
@@ -141,17 +102,15 @@
         gte: 8

   tests:
-  #llama_v3.1_70b
-  #trt backend
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8]
-  #llama_v3.3_70b_instruct_fp8
-  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8]


-# FP4, FP8 tests for systems with 8+ GPUs
-# H20, H100, H200, Blackwell
+# 6: H20, H100, H200, B200, B300, RTX6000-Server
 - condition:
     ranges:
       system_gpu_count:
@@ -171,10 +130,11 @@
   - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
-    # gpt_oss_20b_fp4
+  # gpt_oss_20b_fp4
   - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]

-# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported
+
+# 7: H20, H100, H200, B200, B300
 - condition:
     ranges:
       system_gpu_count: