diff --git a/tests/integration/defs/perf/disagg/cleanup_jobs.sh b/tests/integration/defs/perf/disagg/cleanup_jobs.sh
index 61ad4b160e..d2b21becbd 100644
--- a/tests/integration/defs/perf/disagg/cleanup_jobs.sh
+++ b/tests/integration/defs/perf/disagg/cleanup_jobs.sh
@@ -23,14 +23,19 @@ echo "=========================================="
 echo "Output path: $OUTPUT_PATH"
 echo ""
 
-# Show pytest PID if available (for debugging)
+# Terminate pytest process if still running
 if [ -f "$PID_FILE" ]; then
     PYTEST_PID=$(cat "$PID_FILE" | tr -d '\n')
     echo "Pytest PID: $PYTEST_PID"
 
-    # Check if pytest is still running
+    # Check if pytest is still running and kill it
     if kill -0 "$PYTEST_PID" 2>/dev/null; then
-        echo "Status: Still running"
+        echo "Status: Still running - terminating..."
+        if kill -9 "$PYTEST_PID" 2>/dev/null; then
+            echo "       [OK] Process killed"
+        else
+            echo "       [WARN] Failed to kill process (may already be gone)"
+        fi
     else
         echo "Status: Already terminated"
     fi
diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 8dec350f77..b5e2d9a83f 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -620,15 +620,8 @@ class JobManager:
             result["error"] = error_msg
             return result
 
-        # Check if required log file exists (7_accuracy_eval.log)
-        accuracy_log = os.path.join(result_dir, "7_accuracy_eval.log")
-        if not os.path.exists(accuracy_log):
-            error_msg = f"Accuracy evaluation log file not found: {accuracy_log}"
-            logger.error(error_msg)
-            result["error"] = error_msg
-            return result
-
         # Import and use AccuracyParser
+        # Note: AccuracyParser verifies the log file(s) itself, including glob patterns
         from reporting.accuracy_parser import AccuracyParser
 
         accuracy_parser = AccuracyParser(metrics_config, accuracy_config, result_dir)
diff --git a/tests/integration/defs/perf/disagg/reporting/accuracy_parser.py b/tests/integration/defs/perf/disagg/reporting/accuracy_parser.py
index 1e8e28f709..7704ae8ef0 100644
--- a/tests/integration/defs/perf/disagg/reporting/accuracy_parser.py
+++ b/tests/integration/defs/perf/disagg/reporting/accuracy_parser.py
@@ -1,5 +1,6 @@
 """Accuracy test result parser."""
 
+import glob
 import os
 import re
 from typing import Dict, List
@@ -28,40 +29,77 @@ class AccuracyParser:
         self.result_dir = result_dir
 
     def parse_and_validate(self) -> AccuracyValidationResult:
-        """Parse accuracy_eval.log and validate all configured datasets for all runs.
+        """Parse accuracy_eval.log(s) and validate all configured datasets for all runs.
 
         Supports multiple runs (e.g., pre-benchmark and post-benchmark).
+        Wildcard log_file patterns (e.g., "7_accuracy_eval_*.log") are merged in sorted order.
         All runs must pass for the validation to succeed.
 
         Returns:
             AccuracyValidationResult with validation results for all runs
         """
-        log_file = os.path.join(self.result_dir, self.metrics_config.log_file)
+        log_pattern = self.metrics_config.log_file
 
-        if not os.path.exists(log_file):
+        # Check if pattern contains wildcards
+        if "*" in log_pattern or "?" in log_pattern or "[" in log_pattern:
+            # Use glob to match multiple files
+            log_files = sorted(glob.glob(os.path.join(self.result_dir, log_pattern)))
+
+            if not log_files:
+                return {
+                    "success": False,
+                    "all_passed": False,
+                    "runs": [],
+                    "raw_results": [],
+                    "error": f"No log files found matching pattern: {log_pattern}",
+                }
+
+            logger.info(f"Found {len(log_files)} log file(s) matching pattern '{log_pattern}'")
+        else:
+            # Single file (backward compatible)
+            log_file = os.path.join(self.result_dir, log_pattern)
+
+            if not os.path.exists(log_file):
+                return {
+                    "success": False,
+                    "all_passed": False,
+                    "runs": [],
+                    "raw_results": [],
+                    "error": f"Log file not found: {log_file}",
+                }
+
+            log_files = [log_file]
+
+        # Read and merge all log files
+        combined_log_content = ""
+        failed_files = []
+
+        for log_file in log_files:
+            try:
+                with open(log_file, "r", encoding="utf-8", errors="replace") as f:
+                    content = f.read()
+                    combined_log_content += content
+                    if not content.endswith("\n"):
+                        combined_log_content += "\n"  # Ensure separation between files
+                    logger.info(f"Successfully read log file: {os.path.basename(log_file)}")
+            except Exception as e:
+                failed_files.append((log_file, str(e)))
+                logger.warning(f"Failed to read {log_file}: {e}")
+
+        if not combined_log_content:
+            error_msg = "No valid log content found."
+            if failed_files:
+                error_msg += f" Failed files: {failed_files}"
             return {
                 "success": False,
                 "all_passed": False,
                 "runs": [],
                 "raw_results": [],
-                "error": f"Log file not found: {log_file}",
-            }
-
-        # Read log file
-        try:
-            with open(log_file, "r", encoding="utf-8", errors="replace") as f:
-                log_content = f.read()
-        except Exception as e:
-            return {
-                "success": False,
-                "all_passed": False,
-                "runs": [],
-                "raw_results": [],
-                "error": f"Failed to read log file: {e}",
+                "error": error_msg,
             }
 
         # Extract accuracy values for all runs
-        all_runs_results = self._extract_accuracy_values(log_content)
+        all_runs_results = self._extract_accuracy_values(combined_log_content)
 
         if not all_runs_results:
             return {
diff --git a/tests/integration/defs/perf/disagg/reporting/accuracy_validator.py b/tests/integration/defs/perf/disagg/reporting/accuracy_validator.py
index 14e0a1cdfb..1afe7ab959 100644
--- a/tests/integration/defs/perf/disagg/reporting/accuracy_validator.py
+++ b/tests/integration/defs/perf/disagg/reporting/accuracy_validator.py
@@ -135,7 +135,7 @@ DATASET_DEFAULTS = {
         "higher_is_better": True,
     },
     # Alias for gpqa_diamond (same task, different naming convention)
-    "gpqa_diamond_cot_zeroshot": {
+    "gpqa_diamond_local": {
         "alpha": 0.05,
         "beta": 0.2,
         "sigma": 50,
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml
index 13dee8cb7a..2f94667a53 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml
index 121c5c7dc6..fba25771bb 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
index e7a0ed5620..213c6ced5f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
index bdf4ee44ab..d7279d3b21 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
index 2aa6487c95..1ff8177d23 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
index 5ad6248b5a..0437408d43 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml
index d58e41d4ce..eb9fd647b7 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml
index 8bb52a5e03..e468e51c7e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
index 3453903ce4..7f4c66a24f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 4
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
index 06fb9200a8..b6e6e3d826 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 4
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml
index ba00067777..007c4b3325 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml
index 6258e00ba3..d85ff79d08 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml
index 97ce35a0a7..be9dc6556d 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
index 95c87e6b5e..9f8ec3a9f9 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
index e1036437b0..6260fc5b8c 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml
index 1b9577e9ee..e3b9d07ff4 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml
index e483bedba9..b483ce1c8c 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml
index 5848d8e0f5..7bf0861937 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml
index 5be2578220..9e6eda5459 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml
index b6299357d2..c8f368acfc 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml
index 3c64d97e26..6ff5914009 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml
index 320e37c4ef..7667c7903a 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
index c4ec240071..8a0bc5ca82 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
index 76ea2c7e67..3328a559c3 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml
index 20ab0f61af..0b37895f1e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml
index cfa082f78e..856de14f2f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
index e8dc26447f..690afbff78 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
index aacad0ebc3..115af8642d 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml
@@ -42,9 +42,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
index 9d5ca29c12..358d50fcaf 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -50,9 +50,14 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: true
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+  env_var:
+    HF_HOME: <hf_home_path>
+  tasks:
+    gsm8k:
+      model: "local-completions"
+      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
+      extra_kwargs:
+        trust_remote_code: true
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
index 914924bfd2..71438a796e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -47,9 +47,14 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: true
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+  env_var:
+    HF_HOME: <hf_home_path>
+  tasks:
+    gsm8k:
+      model: "local-completions"
+      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
+      extra_kwargs:
+        trust_remote_code: true
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
index c237dc5f0f..0d6c3b5d77 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@@ -10,7 +10,7 @@ metadata:
   dataset_file: disagg_datasets/kimi-k2-1024-1024-20000-ratio-1_for_serve.json
   accuracy:
     datasets:
-    - dataset_name: gpqa_diamond_cot_zeroshot
+    - dataset_name: gpqa_diamond_local
       expected_value: 0.65
       threshold_type: hypothesis_test
       filter_type: strict-match
@@ -29,7 +29,7 @@ benchmark:
   multi_round: 8
   benchmark_ratio: 1.0
   streaming: true
-  concurrency_list: '16384'
+  concurrency_list: '8192'
   input_length: 1024
   output_length: 1024
   dataset_file: <dataset_file>
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml
index 3c33b288e5..82ba1fc92c 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml
index 0a6135f34a..431258ab8e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
index 75accd8631..a1cb7b2a24 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
index 2df13f962f..7711e130ca 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml
index 3c0b8d2e7a..873de8b2df 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml
index 2dd7fd80b2..845b694cbb 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
index fedb8825b2..93f3662b9f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml
index 5766454980..10692cc270 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
index 4d4f8cb7db..55a292499a 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml
index fc12422943..5c022fa295 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
index d2f81b865e..ee04a0268d 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -45,9 +45,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
index a0d6836d55..00c518c864 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
index 60b022a21d..e6203c75c7 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml
index f4cfcda4e6..ff4c5276bf 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
index dafc6a7df7..c9bc7351f8 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
index bb54d661a5..4185f89449 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml
@@ -43,9 +43,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
index 58ef6a5bed..58ae5032d8 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
@@ -13,7 +13,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 03:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -44,9 +44,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false  # Set to true to enable accuracy evaluation
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
index 9da7bfd08b..c3a4bee671 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -13,7 +13,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 04:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -44,9 +44,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
index 21ab0d765e..f3dcda394e 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -14,7 +14,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 04:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -45,9 +45,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 48
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
index 5697528941..8752c59e1f 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -13,7 +13,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 04:00:00
   job_name: disaggr-test
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -44,9 +44,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
index 7219692295..edec6340a7 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
@@ -13,7 +13,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 04:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -44,9 +44,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 16
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
index 6bfa477ad7..05dc30cf54 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@@ -13,7 +13,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 04:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
@@ -44,9 +44,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     tensor_parallel_size: 32
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
index dbba4ca355..44a756fc8d 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@@ -41,9 +41,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml
index 6178b8e929..2584fd7908 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL.yaml
@@ -41,9 +41,6 @@ profiling:
   nsys_on: false
 accuracy:
   enable_accuracy_test: false
-  model: local-completions
-  tasks: gsm8k
-  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
 worker_config:
   gen:
     enable_layerwise_nvtx_marker: true
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index 36e786949d..9e724c56de 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -112,8 +112,8 @@ class TestDisaggBenchmark:
                 )
                 assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
 
-                # Wait for completion (timeout: 10 hours = 36000 seconds)
-                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 15 hours = 54000 seconds)
+                JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
@@ -188,8 +188,8 @@ class TestDisaggBenchmark:
                 # Validate submission result
                 assert job_id, f"Failed to get job_id for {test_config.test_id}"
 
-                # Wait for completion (timeout: 10 hours = 36000 seconds)
-                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 15 hours = 54000 seconds)
+                JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
@@ -272,8 +272,8 @@ class TestDisaggBenchmark:
                 )
                 assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"
 
-                # Wait for completion (timeout: 10 hours = 36000 seconds)
-                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 15 hours = 54000 seconds)
+                JobManager.wait_for_completion(job_id, 54000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index b4aecc116d..fbd28aa534 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -12,7 +12,14 @@ GPU_RESOURCE_CONFIG = {
         "lock_freq_graphics_mhz": 2062,  # GPU graphics clock lock frequency (MHz)
         "lock_freq_memory_mhz": 3996,  # GPU memory clock lock frequency (MHz)
     },
-    # OCI GB300
+    # Lyris GB200
+    "GB200_LYRIS": {
+        "slurm_extra_args": "",  # GB200 does not require extra args
+        "set_segment": True,
+        "lock_freq_graphics_mhz": None,  # TODO: Set GB200 lock frequency
+        "lock_freq_memory_mhz": None,
+    },
+    # Lyris GB300
     "GB300": {
         "slurm_extra_args": "",  # GB300 does not require extra args
         "set_segment": True,
@@ -121,6 +128,10 @@ class EnvManager:
     def get_dataset_dir() -> str:
         return os.getenv("DATASET_DIR", "<Your dataset directory>")
 
+    @staticmethod
+    def get_hf_home_dir() -> str:
+        return os.getenv("HF_HOME_DIR", "<Your HF home directory>")
+
     @staticmethod
     def get_output_path() -> str:
         output_path = os.getenv(
diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py
index 567834d6a7..96cfc5f2be 100644
--- a/tests/integration/defs/perf/disagg/utils/config_loader.py
+++ b/tests/integration/defs/perf/disagg/utils/config_loader.py
@@ -87,10 +87,10 @@ class AccuracyConfig:
 # ============================================================================
 
 # Accuracy test uses accuracy_eval.log (markdown table output from lm_eval)
-# Note: Only log_file is used by AccuracyParser (accuracy_parser.py)
-# The regex pattern is hardcoded in AccuracyParser._extract_accuracy_values()
+# Note: submit.py generates separate log files for each task (e.g., 7_accuracy_eval_{task}.log)
+# Use glob pattern to automatically match all accuracy log files
 _COMMON_ACCURACY_METRICS = MetricsConfig(
-    log_file="7_accuracy_eval.log",
+    log_file="7_accuracy_eval_*.log",
     extractor_pattern=r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
     metric_names=["flexible-extract", "strict-match"],
 )
@@ -148,7 +148,7 @@ DEFAULT_METRICS_CONFIG = {
             "SERVER_MEDIAN_ITL",
             "SERVER_P99_ITL",
             "SERVER_MEAN_E2EL",
-            "SERVER_E2EL",  # Median E2EL (keep the same name as disagg)
+            "SERVER_MEDIAN_E2EL",  # Median E2EL (keep the same name as disagg)
             "SERVER_P99_E2EL",
         ],
     ),
@@ -230,6 +230,9 @@ class ConfigLoader:
         if gpu_type is None:
             gpu_type = EnvManager.get_gpu_type()
 
+        # GB200_LYRIS is also in the GB200 family
+        if gpu_type.startswith("GB200_"):
+            gpu_type = "GB200"
         configs = []
 
         if not self.base_dir.exists():
@@ -406,7 +409,7 @@ class ConfigLoader:
                 if "metrics" in acc_meta:
                     metrics_override = acc_meta["metrics"]
                     custom_metrics = MetricsConfig(
-                        log_file=metrics_override.get("log_file", "7_accuracy_eval.log"),
+                        log_file=metrics_override.get("log_file", "7_accuracy_eval_*.log"),
                         extractor_pattern=metrics_override.get(
                             "extractor_pattern",
                             r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
@@ -517,9 +520,10 @@ class ConfigLoader:
             ("slurm", "job_name"): lambda: EnvManager.get_slurm_job_name(),
             ("environment", "container_mount"): lambda: EnvManager.get_container_mount(model_name),
             ("environment", "container_image"): lambda: EnvManager.get_container_image(),
-            ("environment", "trtllm_repo"): lambda: EnvManager.get_repo_dir(),
+            ("environment", "trtllm_repo"): lambda: self._get_repo_dir(),
             ("environment", "trtllm_wheel_path"): lambda: EnvManager.get_trtllm_wheel_path(),
             ("benchmark", "dataset_file"): lambda: self._get_dataset_file(config),
+            ("accuracy", "env_var", "HF_HOME"): lambda: EnvManager.get_hf_home_dir(),
             ("environment", "work_dir"): lambda: EnvManager.get_script_dir(),
             ("environment", "model_path"): lambda: self._get_full_model_path(config),
             ("slurm", "script_file"): lambda: self._get_script_file(config),
@@ -528,11 +532,67 @@ class ConfigLoader:
         }
 
         # Apply overrides based on field paths
-        for (section, key), value_getter in field_mapping.items():
-            if section in config:
-                config[section][key] = value_getter()
+        for path, value_getter in field_mapping.items():
+            self._set_nested_value(config, path, value_getter())
+
+        # Apply dynamic overrides for accuracy.tasks (task names are dynamic)
+        self._apply_accuracy_tasks_overrides(config)
+
         return config
 
+    def _set_nested_value(self, config: dict, path: tuple, value: any) -> None:
+        """Set value at nested path in config.
+
+        Supports arbitrary nesting depth using tuple paths.
+        Creates missing intermediate levels automatically.
+
+        Args:
+            config: Configuration dictionary
+            path: Tuple of keys representing the path (e.g., ("a", "b", "c"))
+            value: Value to set
+
+        Example:
+            _set_nested_value(config, ("accuracy", "env_var", "HF_HOME"), "/path")
+            # Sets config["accuracy"]["env_var"]["HF_HOME"] = "/path"
+        """
+        current = config
+
+        # Traverse/create path, except for the last key
+        for key in path[:-1]:
+            if key not in current:
+                current[key] = {}
+            current = current[key]
+
+        # Set the final value
+        current[path[-1]] = value
+
+    def _apply_accuracy_tasks_overrides(self, config: dict) -> None:
+        """Apply environment overrides for accuracy.tasks configuration.
+
+        Handles dynamic task names (e.g., gsm8k, gpqa_diamond_local).
+        Replaces placeholders in custom_config paths.
+
+        Args:
+            config: Configuration dictionary
+        """
+        if "accuracy" not in config or "tasks" not in config["accuracy"]:
+            return
+
+        repo_dir = EnvManager.get_repo_dir()
+
+        # Iterate through all tasks (task names are dynamic)
+        for task_name, task_config in config["accuracy"]["tasks"].items():
+            if not isinstance(task_config, dict):
+                continue
+
+            # Replace <repo_path> in custom_config
+            if "extra_kwargs" in task_config and "custom_config" in task_config["extra_kwargs"]:
+                custom_config_path = task_config["extra_kwargs"]["custom_config"]
+                if "<repo_path>" in custom_config_path:
+                    task_config["extra_kwargs"]["custom_config"] = custom_config_path.replace(
+                        "<repo_path>", repo_dir
+                    )
+
     def _get_full_model_path(self, config: dict) -> str:
         """Get full model path by combining MODEL_DIR with model directory name.
 
@@ -548,6 +608,12 @@ class ConfigLoader:
         else:
             return ""
 
+    def _get_repo_dir(self):
+        if EnvManager.get_install_mode() == "source":
+            return EnvManager.get_repo_dir()
+        else:  # wheel/none install_mode, no need to set repo_dir
+            return ""
+
     def _get_dataset_file(self, config: dict) -> str:
         """Get dataset file by combining dataset directory with dataset file name.