diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
index 50761e7e8e..e4bda5707d 100644
--- a/examples/disaggregated/slurm/benchmark/config.yaml
+++ b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -49,9 +49,12 @@ profiling:
 # Accuracy Configuration
 accuracy:
   enable_accuracy_test: false  # Set to true to enable accuracy evaluation
-  model: "local-completions"  # Model type for lm_eval
-  tasks: "gsm8k"  # Evaluation tasks (comma-separated)
-  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096"  # Extra model arguments for lm_eval
+  tasks:  # Evaluation tasks; one nested entry per task name, each with its own settings
+    gsm8k:
+      model: "local-completions"  # Model type for lm_eval
+      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"  # Extra model arguments for lm_eval
+      extra_kwargs:  # NOTE(review): presumably extra keyword arguments for this task's evaluation run - confirm against the consuming script
+        trust_remote_code: true  # NOTE(review): looks like it permits running remotely fetched code - confirm this is intended
 
 worker_config:
   gen:
diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml
index 2f10c9707d..5c5b4441b4 100644
--- a/examples/wide_ep/slurm_scripts/config.yaml
+++ b/examples/wide_ep/slurm_scripts/config.yaml
@@ -47,9 +47,12 @@ profiling:
 # Accuracy Configuration
 accuracy:
   enable_accuracy_test: false  # Set to true to enable accuracy evaluation
-  model: "local-completions"  # Model type for lm_eval
-  tasks: "gsm8k"  # Evaluation tasks (comma-separated)
-  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096"  # Extra model arguments for lm_eval
+  tasks:  # Evaluation tasks; one nested entry per task name, each with its own settings
+    gsm8k:
+      model: "local-completions"  # Model type for lm_eval
+      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"  # Extra model arguments for lm_eval
+      extra_kwargs:  # NOTE(review): presumably extra keyword arguments for this task's evaluation run - confirm against the consuming script
+        trust_remote_code: true  # NOTE(review): looks like it permits running remotely fetched code - confirm this is intended
 
 worker_config:
   gen: