diff --git a/requirements-dev.txt b/requirements-dev.txt
index 23de534177..5fbd9067bc 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -28,7 +28,7 @@ jieba==0.42.1
 rouge==1.0.1
 pytest-rerunfailures
 ruff==0.9.4
-lm_eval[api]==0.4.9.1  # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
+lm_eval[api]==0.4.10
 docstring_parser
 genai-perf==0.0.13
 opentelemetry-sdk>=1.26.0
@@ -39,3 +39,4 @@ fuzzywuzzy==0.18.0
 aiperf==0.3.0
 nanobind>=2.9.0
 nixl==0.8.0
+hf-transfer==0.1.9
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
index 8fbb966075..882eb122b3 100644
--- a/tensorrt_llm/evaluate/lm_eval.py
+++ b/tensorrt_llm/evaluate/lm_eval.py
@@ -918,7 +918,8 @@ class LongBenchV1(LmEvalEvaluator):
             lm=lm_cls(llm,
                       sampling_params=sampling_params,
                       streaming=streaming,
-                      chat_template_kwargs=self.chat_template_kwargs),
+                      chat_template_kwargs=self.chat_template_kwargs,
+                      output_dir=self.output_dir),
             task_dict=self.task_dict,
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 32568ad7e8..fc45b1eb8b 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -19,7 +19,6 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[attn_backend=TRTLLM-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3::test_skip_softmax_attention_multi_gpus[target_sparsity_0.9]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 64605d88a2..ed6cc0ac4b 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -78,6 +78,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3::test_skip_softmax_attention_multi_gpus[target_sparsity_0.9] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (60)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 53266a9303..e9415ee509 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -264,11 +264,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343)
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5799901)
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] SKIP (https://nvbugs/5810935)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5] SKIP (https://nvbugs/5810940)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9] SKIP (https://nvbugs/5810940)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5810940)
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5810940)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3::test_skip_softmax_attention_multi_gpus[target_sparsity_0.9] SKIP (https://nvbugs/5810940)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)